def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='gpt2', **kwargs):
    from fairseq import hub_utils

    x = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        bpe=bpe,
        load_checkpoint_heads=True,
        **kwargs,
    )
    return RobertaHubInterface(x['args'], x['task'], x['models'][0])
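A hedged usage sketch for the RoBERTa-style interface returned above: `RobertaModel.from_pretrained` is fairseq's public entry point to this code path, and the archive name and example sentences below are illustrative rather than taken from the listing.

from fairseq.models.roberta import RobertaModel

roberta = RobertaModel.from_pretrained("roberta.base", checkpoint_file="model.pt")
roberta.eval()  # disable dropout for deterministic predictions
tokens = roberta.encode("Hello world!")       # BPE-encode into a tensor of token ids
features = roberta.extract_features(tokens)   # last-layer features, shape (1, seq_len, hidden)
print(roberta.fill_mask("The capital of France is <mask>.", topk=3))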
def umberto_wikipedia_uncased(**kwargs):
    from fairseq import hub_utils
    from fairseq.models.roberta.hub_interface import RobertaHubInterface

    x = hub_utils.from_pretrained(
        model_name_or_path='https://mxmdownloads.s3.amazonaws.com/umberto/umberto.wikipedia.uncased.tar.gz',
        checkpoint_file='model.pt',
        data_name_or_path='.',
        bpe='sentencepiece',
        load_checkpoint_heads=True,
        **kwargs,
    )
    return RobertaHubInterface(x['args'], x['task'], x['models'][0])
def load_model(cls, model_name: str, lang: str, **kwargs):
    from fairseq import hub_utils

    ckpt_dir = download_or_load(model_name, lang)
    tok_path = download_or_load(f"tokenizers/bpe32k.{lang}.zip", lang)
    x = hub_utils.from_pretrained(
        ckpt_dir,
        "model.pt",
        ckpt_dir,
        load_checkpoint_heads=True,
        **kwargs,
    )
    return BrainRobertaHubInterface(x["args"], x["task"], x["models"][0], tok_path)
def __init__(
    self,
    sent_tokenizer,
    device: torch.device,  # annotated as torch.device so that device.type below is valid
    ext_model_name: str,
    config,
):
    super().__init__(config)
    ckpt_dir = download_or_load(f"bert/{ext_model_name}", config.lang)
    tok_path = download_or_load(
        f"tokenizers/bpe32k.{config.lang}.zip",
        config.lang,
    )
    x = hub_utils.from_pretrained(
        ckpt_dir,
        "model.pt",
        load_checkpoint_heads=True,
    )
    wrapper = BrainRobertaHubInterface(
        x["args"],
        x["task"],
        x["models"][0],
        tok_path,
    )
    clf_dict = torch.load(
        f"{ckpt_dir}/classifier.pt",
        map_location=device,
    )
    classifier_size = 768 if "base" in config.n_model else 1024
    self._device = device
    self._classifier = nn.Linear(classifier_size, 1).to(device).eval()
    self._classifier.load_state_dict(clf_dict)
    self._model = wrapper.model.encoder.sentence_encoder.to(device).eval()
    if "cuda" in device.type:
        self._model = self._model.half()
        self._classifier = self._classifier.half()
    self._tokenizer = BertSumTokenizer(
        bpe=wrapper.bpe,
        dictionary=wrapper.task.source_dictionary,
        sent_tokenizer=sent_tokenizer,
    )
def load_model(cls, model_name: str, lang: str, **kwargs):
    """
    Load pre-trained model as RobertaHubInterface.

    :param model_name: model name from available_models
    :return: pre-trained model
    """
    from fairseq import hub_utils

    ckpt_dir = download_or_load(model_name, lang)
    x = hub_utils.from_pretrained(
        ckpt_dir,
        "model.pt",
        load_checkpoint_heads=True,
        **kwargs,
    )
    return JabertaHubInterface(x["args"], x["task"], x["models"][0])
def _load_model(self, path: str, bpe: str, bpe_filename: str) -> RobertaHubInterface:
    if path == "xlmr.large" or path == "xlmr.base":
        return hub.load("pytorch/fairseq", path, force_reload=True)
    else:
        checkpoint_file = "model.pt" if os.path.exists(os.path.join(path, "model.pt")) else "checkpoint_best.pt"
        loaded = hub_utils.from_pretrained(
            model_name_or_path=path,
            checkpoint_file=checkpoint_file,
            data_name_or_path=path,
            bpe=bpe,
            sentencepiece_vocab=os.path.join(path, bpe_filename),
            sentencepiece_model=os.path.join(path, bpe_filename),
            load_checkpoint_heads=True,
            archive_map=RobertaModel.hub_models(),
            cpu=False,
        )
        return RobertaHubInterface(loaded['args'], loaded['task'], loaded['models'][0])
def from_pretrained(
    cls,
    model_name_or_path,
    checkpoint_file="model.pt",
    data_name_or_path=".",
    config_yaml="config.yaml",
    **kwargs,
):
    from fairseq import hub_utils

    x = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        config_yaml=config_yaml,
        **kwargs,
    )
    return S2THubInterface(x["args"], x["task"], x["models"][0])
def from_pretrained(
    cls,
    model_name_or_path,
    checkpoint_file='model.pt',
    data_name_or_path='.',
    bpe='bert',
    **kwargs,
):
    x = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        bpe=bpe,
        load_checkpoint_heads=True,
        **kwargs,
    )
    return ProphetNetHubInterface(x['args'], x['task'], x['models'][0])
def from_pretrained(cls, model_name_or_path, checkpoint_file="model.pt", data_name_or_path=".", bpe="sentencepiece", **kwargs):
    from fairseq import hub_utils

    x = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        bpe=bpe,
        load_checkpoint_heads=True,
        **kwargs,
    )
    return RobertaHubInterface(x["args"], x["task"], x["models"][0])
def from_pretrained(
    cls,
    model_name_or_path,
    checkpoint_file="model.pt",
    data_name_or_path=".",
    **kwargs,
):
    """
    Load a :class:`~fairseq.models.FairseqModel` from a pre-trained model
    file. Downloads and caches the pre-trained model file if needed.

    The base implementation returns a
    :class:`~fairseq.hub_utils.GeneratorHubInterface`, which can be used to
    generate translations or sample from language models. The underlying
    :class:`~fairseq.models.FairseqModel` can be accessed via the
    *generator.models* attribute.

    Other models may override this to implement custom hub interfaces.

    Args:
        model_name_or_path (str): either the name of a pre-trained model to
            load or a path/URL to a pre-trained model state dict
        checkpoint_file (str, optional): colon-separated list of checkpoint
            files in the model archive to ensemble (default: 'model.pt')
        data_name_or_path (str, optional): point args.data to the archive
            at the given path/URL. Can start with '.' or './' to reuse the
            model archive path.
    """
    from fairseq import hub_utils

    x = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        **kwargs,
    )
    cls.upgrade_args(x["args"])
    logger.info(x["args"])
    return hub_utils.GeneratorHubInterface(x["args"], x["task"], x["models"])
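Since the base implementation returns a `GeneratorHubInterface`, the usual way to exercise it is through `torch.hub`, which routes to this method. A hedged sketch: the WMT19 archive name is one of fairseq's published checkpoints, and the extra keyword arguments are simply forwarded through `**kwargs`.

import torch

# downloads and caches the archive, then returns a GeneratorHubInterface
en2de = torch.hub.load(
    "pytorch/fairseq",
    "transformer.wmt19.en-de.single_model",
    tokenizer="moses",
    bpe="fastbpe",
)
en2de.eval()  # disable dropout
print(en2de.translate("Machine learning is great!"))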
def evaluate_task(self):
    checkpoints_output_dir = os.path.join("checkpoints", self.model_name, self.task.spec().output_path())
    checkpoint_file = "checkpoint_last.pt" if self.task.spec().no_dev_set else "checkpoint_best.pt"
    loaded = hub_utils.from_pretrained(
        model_name_or_path=checkpoints_output_dir,
        checkpoint_file=checkpoint_file,
        data_name_or_path=self.task_output_dir,
        bpe="sentencepiece",
        sentencepiece_vocab=os.path.join(self.model_dir, "sentencepiece.bpe.model"),
        load_checkpoint_heads=True,
        archive_map=RobertaModel.hub_models(),
    )
    roberta = RobertaHubInterface(loaded['args'], loaded['task'], loaded['models'][0])
    evaluator = TaskEvaluator(self.task, self.task_id, roberta, self.input_dir, checkpoints_output_dir)
    return evaluator.evaluate()
def from_pretrained(
    cls,
    model_name_or_path,
    checkpoint_file="model.pt",
    data_name_or_path=".",
    bpe="gpt2",
    **kwargs,
):
    from fairseq import hub_utils

    x = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        bpe=bpe,
        load_checkpoint_heads=True,
        **kwargs,
    )
    return cls(x["args"], x["task"])
def evaluate_task(self):
    checkpoints_output_dir = os.path.join("checkpoints", self.model_name, self.task.spec().output_path())
    checkpoint_file = "checkpoint_last.pt" if self.task.spec().no_dev_set else "checkpoint_best.pt"
    model_classes = {
        "roberta": (RobertaModel, RobertaHubInterface),
        "bart": (BARTModel, CustomBARTHubInterface),
    }
    arch_type = self.arch.split("_")[0]
    model_class = model_classes[arch_type][0]
    spm_path = os.path.join(self.model_dir, "sentencepiece.bpe.model")
    loaded = hub_utils.from_pretrained(
        model_name_or_path=checkpoints_output_dir,
        checkpoint_file=checkpoint_file,
        data_name_or_path=self.task_output_dir,
        bpe="sentencepiece",
        sentencepiece_model=spm_path,
        sentencepiece_vocab=spm_path,
        load_checkpoint_heads=True,
        archive_map=model_class.hub_models(),
    )
    model_interface = model_classes[arch_type][1]
    model = model_interface(loaded['args'], loaded['task'], loaded['models'][0])
    evaluator = TaskEvaluator(self.task, self.task_id, model, self.input_dir, checkpoints_output_dir)
    return evaluator.evaluate()
def from_pretrained(
    cls,
    model_name_or_path,
    checkpoint_file="model.pt",
    data_name_or_path=".",
    config_yaml="config.yaml",
    vocoder: str = "griffin_lim",
    fp16: bool = False,
    **kwargs,
):
    from fairseq import hub_utils

    x = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        config_yaml=config_yaml,
        vocoder=vocoder,
        fp16=fp16,
        **kwargs,
    )
    return TTSHubInterface(x["args"], x["task"], x["models"][0])
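For the text-to-speech variant, fairseq's TTS model cards drive the resulting task/model pair through `TTSHubInterface` classmethods. A hedged sketch along those lines, assuming a recent fairseq that ships `load_model_ensemble_and_task_from_hf_hub`; the checkpoint id and input text are illustrative and not taken from the listing.

from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    "facebook/fastspeech2-en-ljspeech",  # example checkpoint id
    arg_overrides={"vocoder": "griffin_lim", "fp16": False},
)
model = models[0]
TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
generator = task.build_generator([model], cfg)

sample = TTSHubInterface.get_model_input(task, "Hello, this is a test run.")
wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)  # waveform tensor and sample rate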
def from_pretrained(
    cls,
    model_name_or_path,
    checkpoint_file="model.pt",
    data_name_or_path=".",
    bpe="gpt2",
    sample_break_mode="eos",
    **kwargs,
):
    from fairseq import hub_utils

    x = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        bpe=bpe,
        load_checkpoint_heads=True,
        sample_break_mode=sample_break_mode,
        **kwargs,
    )
    return BARTHubInterface(x["args"], x["task"], x["models"][0])
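The BART variant returns a `BARTHubInterface`, which fairseq's BART examples drive with `sample` for conditional generation. A hedged sketch: the archive name, input text, and generation hyperparameters mirror the public CNN/DailyMail example rather than anything in this listing.

from fairseq.models.bart import BARTModel

bart = BARTModel.from_pretrained("bart.large.cnn", checkpoint_file="model.pt")
bart.eval()
summaries = bart.sample(
    ["New results on battery chemistry were announced by the lab today ..."],
    beam=4,
    lenpen=2.0,
    max_len_b=140,
    min_len=55,
    no_repeat_ngram_size=3,
)
print(summaries[0])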
def from_pretrained(
    cls,
    model_name_or_path,
    checkpoint_file='model.pt',
    data_name_or_path='.',
    bpe='gpt2',
    **kwargs,
):
    from fairseq import hub_utils

    x = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        bpe=bpe,
        load_checkpoint_heads=True,
        **kwargs,
    )
    if 'mbart' in model_name_or_path:
        x['args'].sentencepiece_vocab = os.path.join(model_name_or_path, 'sentence.bpe.model')
    return BARTHubInterface(x['args'], x['task'], x['models'][0])
def load_model(cls, model_name: str, lang: str, **kwargs):
    """
    Load pre-trained model as RobertaHubInterface.

    :param model_name: model name from available_models
    :return: pre-trained model
    """
    from fairseq import hub_utils

    # cache directory is treated as the home directory for both model and data files
    ckpt_dir = download_or_load(model_name, lang)
    x = hub_utils.from_pretrained(
        ckpt_dir,
        "model.pt",
        ckpt_dir,
        load_checkpoint_heads=True,
        **kwargs,
    )
    return SegmentBertHubInterface(
        x["args"],
        x["task"],
        x["models"][0],
        lang,
    )
def from_pretrained(
    cls,
    model_name_or_path,
    sentencepiece_model="spm_256000.model",
    checkpoint_file="model.pt",
    data_name_or_path=".",
    bpe="sentencepiece",
    layernorm_embedding=True,
    **kwargs,
):
    from fairseq import hub_utils

    x = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        bpe=bpe,
        load_checkpoint_heads=True,
        sentencepiece_model=os.path.join(model_name_or_path, sentencepiece_model),
        **kwargs,
    )
    return GENREHubInterface(x["args"], x["task"], x["models"][0])
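The sentencepiece model (`spm_256000.model`) and `layernorm_embedding=True` suggest this is the multilingual GENRE (mGENRE) entry point. A hedged sketch of how the GENRE repository calls the returned `GENREHubInterface`; the checkpoint path and sentence are illustrative, and the real pipeline additionally passes a `prefix_allowed_tokens_fn` built from a knowledge-base trie to constrain decoding to valid entity names.

from genre.fairseq_model import mGENRE  # assumes the GENRE repository is installed

model = mGENRE.from_pretrained("fairseq_multilingual_entity_disambiguation").eval()
# unconstrained beam search for brevity; each result is a candidate entity name with a score
print(model.sample(["[START] Einstein [END] era un fisico tedesco."], beam=5))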
def initialize(self, ctx):
    self.manifest = ctx.manifest
    properties = ctx.system_properties
    model_dir = properties.get("model_dir")
    self.device = torch.device(
        "cuda:" + str(properties.get("gpu_id")) if torch.cuda.is_available() else "cpu")
    logger.debug('Will load from {0}'.format(model_dir))

    # Read model serialize/pt file
    x = hub_utils.from_pretrained(
        model_dir,
        "model.pt",
        DATA_PATH,
        load_checkpoint_heads=True,
    )
    model_interface = BrainRobertaHubInterface(
        x["args"],
        x["task"],
        x["models"][0],
        model_dir,
    ).to(self.device)
    tagger = mecab.MeCab()
    self.model = PororoBertMrc(
        model_interface,
        tagger,
        postprocess_span,
        TaskConfig("mrc", "ko", "brainbert.base.ko.korquad"),
    )

    # Read the mapping file, index to object name
    # mapping_file_path = os.path.join(model_dir, "index_to_name.json")
    # if os.path.isfile(mapping_file_path):
    #     with open(mapping_file_path) as f:
    #         self.mapping = json.load(f)
    # else:
    #     logger.warning('Missing the index_to_name.json file. Inference output will not include class name.')

    self.initialized = True
def from_pretrained(
    cls,
    model_path: str,
    sentencepiece_prefix: str,
    dictionary_path: str,
):
    x = hub_utils.from_pretrained(
        "./",
        checkpoint_file=model_path,
        archive_map={},
        data_name_or_path=dictionary_path,
        task="multilingual_translation",
    )
    sp_models = {
        lang: SentencePieceProcessor(model_file=f"{sentencepiece_prefix}.{lang}.model")
        for lang in x["task"].langs
    }
    return cls(
        models=x["models"],
        task=x["task"],
        cfg=x["args"],
        sp_models=sp_models,
    )
def from_pretrained(cls, model_name_or_path, checkpoint_file="model.pt", data_name_or_path=".", bpe="hf_byte_bpe", bpe_vocab="vocab.json", bpe_merges="merges.txt", bpe_add_prefix_space=False, **kwargs):
    from fairseq import hub_utils

    x = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        bpe=bpe,
        load_checkpoint_heads=True,
        bpe_vocab=bpe_vocab,
        bpe_merges=bpe_merges,
        bpe_add_prefix_space=bpe_add_prefix_space,
        **kwargs,
    )
    return RobertaHubInterface(x["args"], x["task"], x["models"][0])
def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='hf_byte_bpe', bpe_vocab='vocab.json', bpe_merges='merges.txt', bpe_add_prefix_space=False, **kwargs):
    from fairseq import hub_utils

    x = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        bpe=bpe,
        load_checkpoint_heads=True,
        bpe_vocab=bpe_vocab,
        bpe_merges=bpe_merges,
        bpe_add_prefix_space=bpe_add_prefix_space,
        **kwargs,
    )
    return RobertaHubInterface(x['args'], x['task'], x['models'][0])
import torch
import mecab
from fairseq import hub_utils  # needed for hub_utils.from_pretrained below
from fairseq.models.roberta import RobertaHubInterface, RobertaModel

from pororo.models.brainbert.BrainRoBERTa import BrainRobertaHubInterface
from pororo.tasks.machine_reading_comprehension import PororoBertMrc
from pororo.tasks.utils.base import TaskConfig
from pororo.tasks.utils.download_utils import download_or_load
from pororo.tasks.utils.tokenizer import CustomTokenizer
from pororo.utils import postprocess_span

ckpt_dir = download_or_load("bert/brainbert.base.ko.korquad", "ko")
tok_path = download_or_load("tokenizers/bpe32k.ko.zip", "ko")
x = hub_utils.from_pretrained(
    ckpt_dir,
    "model.pt",
    ckpt_dir,
    load_checkpoint_heads=True,
)
model = BrainRobertaHubInterface(
    x["args"],
    x["task"],
    x["models"][0],
    tok_path,
).to(torch.device("cuda"))
tagger = mecab.MeCab()
final = PororoBertMrc(
    model,
    tagger,
    postprocess_span,
    TaskConfig("mrc", "ko", "brainbert.base.ko.korquad"),
)
print(final("이름이 뭐야?", "이름은 시리야."))
def to_pytorch(fsmt_checkpoint_path, save_path):
    assert os.path.exists(fsmt_checkpoint_path)
    os.makedirs(save_path, exist_ok=True)
    print(f"Writing results to {save_path}")

    checkpoint_file = basename(fsmt_checkpoint_path)
    fsmt_folder_path = dirname(fsmt_checkpoint_path)
    cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel
    models = cls.hub_models()
    kw = {"bpe": "fastbpe", "tokenizer": "moses"}
    data_name_or_path = "."
    print(f"using checkpoint {checkpoint_file}")
    chkpt = hub_utils.from_pretrained(
        fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kw
    )
    args = vars(chkpt["args"]["model"])
    src_lang = args["source_lang"]
    tgt_lang = args["target_lang"]
    data_root = dirname(save_path)
    model_dir = basename(save_path)

    src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt")
    tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt")
    src_dict = Dictionary.load(src_dict_file)
    src_vocab = rewrite_dict_keys(src_dict.indices)
    s_src_vocab = len(src_vocab)
    src_vocab_file = os.path.join(save_path, "vocab-src.json")
    print(f"Generating {src_vocab_file} of {s_src_vocab} of {src_lang} records")
    with open(src_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))

    do_lower_case = True
    for k in src_vocab.keys():
        if not k.islower():
            do_lower_case = False
            break

    tgt_dict = Dictionary.load(tgt_dict_file)
    tgt_vocab = rewrite_dict_keys(tgt_dict.indices)
    s_tgt_vocab = len(tgt_vocab)
    tgt_vocab_file = os.path.join(save_path, "vocab-tgt.json")
    print(f"Generating {tgt_vocab_file} of {s_tgt_vocab} of {tgt_lang} records")
    with open(tgt_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent))

    merges_file = os.path.join(save_path, VOCAB_FS["merges_file"])
    for fn in ["bpecodes", "code"]:  # older fairseq called the merges file "code"
        fsmt_merges_file = os.path.join(fsmt_folder_path, fn)
        if os.path.exists(fsmt_merges_file):
            break
    with open(fsmt_merges_file, encoding="utf-8") as fin:
        merges = fin.read()
    merges = re.sub(r" \d+$", "", merges, 0, re.M)  # remove frequency number
    print(f"Generating {merges_file}")
    with open(merges_file, "w", encoding="utf-8") as fout:
        fout.write(merges)

    fsmt_model_config_file = os.path.join(save_path, "config.json")
    assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}"
    assert args["tokenizer"] == "moses", f"need to extend tokenizer to support bpe={args['tokenizer']}"
    model_conf = {
        "archs": ["FSMTForConditionalGeneration"],
        "model_type": "fsmt",
        "drop_act": args["drop_act"],
        "act_fun": "relu",
        "drop_attn": args["drop_attn"],
        "d_hidden": args["decoder_embed_dim"],
        "drop": args["drop"],
        "init_std": 0.02,
        "n_pos": args["max_source_positions"],
        "n_lays": args["n_enc_lays"],
        "s_src_vocab": s_src_vocab,
        "s_tgt_vocab": s_tgt_vocab,
        "langs": [src_lang, tgt_lang],
        "n_enc_heads": args["n_enc_heads"],
        "d_enc_ffn": args["encoder_ffn_embed_dim"],
        "drop_enc": args["drop_enc"],
        "n_enc_lays": args["n_enc_lays"],
        "n_dec_heads": args["n_dec_heads"],
        "d_dec_ffn": args["decoder_ffn_embed_dim"],
        "drop_dec": args["drop_dec"],
        "n_dec_lays": args["n_dec_lays"],
        "BOS": 0,
        "PAD": 1,
        "EOS": 2,
        "is_enc_dec": True,
        "scale": not args["no_scale_embedding"],
        "tie_word_embeds": args["share_all_embeddings"],
    }
    model_conf["n_beams"] = 5
    model_conf["early_stop"] = False
    if model_dir in best_score_hparams and "len_penalty" in best_score_hparams[model_dir]:
        model_conf["len_penalty"] = best_score_hparams[model_dir]["len_penalty"]
    else:
        model_conf["len_penalty"] = 1.0
    print(f"Generating {fsmt_model_config_file}")
    with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))

    fsmt_tokenizer_config_file = os.path.join(save_path, TOKENIZER_CONFIG_FILE)
    tokenizer_conf = {
        "langs": [src_lang, tgt_lang],
        "model_max_length": 1024,
        "do_lower_case": do_lower_case,
    }
    print(f"Generating {fsmt_tokenizer_config_file}")
    with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))

    model = chkpt["models"][0]
    model_state_dict = model.state_dict()
    model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items())
    ignore_keys = [
        "model.model",
        "model.encoder.version",
        "model.decoder.version",
        "model.encoder_embed_tokens.weight",
        "model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)

    config = PreTrained.from_pretrained(save_path)
    model_new = ForConditionalGen(config)
    model_new.load_state_dict(model_state_dict, strict=False)
    pytorch_weights_dump_path = os.path.join(save_path, WEIGHTS_NAME)
    print(f"Generating {pytorch_weights_dump_path}")
    torch.save(model_state_dict, pytorch_weights_dump_path)

    print("Conversion is done!")
    print("\nLast step is to upload the files to s3")
    print(f"cd {data_root}")
    print(f"transformers-cli upload {model_dir}")
import os

from fairseq import hub_utils
from fairseq.models.roberta import RobertaModel, RobertaHubInterface
from tqdm import tqdm

model_path = "polish_roberta_large_no_finetune"
loaded = hub_utils.from_pretrained(
    model_name_or_path=model_path,
    data_name_or_path=model_path,
    bpe="sentencepiece",
    sentencepiece_vocab=os.path.join(model_path, "sentencepiece.bpe.model"),
    load_checkpoint_heads=True,
    archive_map=RobertaModel.hub_models(),
    cpu=False,
)
roberta = RobertaHubInterface(loaded['args'], loaded['task'], loaded['models'][0])
roberta.eval()
roberta.cuda()
preds = roberta.fill_mask('Ala <mask>, kota', topk=3)
# import pdb; pdb.set_trace()


def predict(f_in_path, f_out_path):
    f_in = open(f_in_path, 'r', newline='\n')
    f_out = open(f_out_path, 'w', newline='\n')
    for line in tqdm(f_in, total=19986):
def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path):
    # prep
    assert os.path.exists(fsmt_checkpoint_path)
    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
    print(f"Writing results to {pytorch_dump_folder_path}")

    # handle various types of models
    checkpoint_file = basename(fsmt_checkpoint_path)
    fsmt_folder_path = dirname(fsmt_checkpoint_path)

    cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel
    models = cls.hub_models()
    kwargs = {"bpe": "fastbpe", "tokenizer": "moses"}
    data_name_or_path = "."
    # note: since the model dump is old, fairseq has upgraded its model some
    # time later, and it does a whole lot of rewrites and splits on the saved
    # weights, therefore we can't use torch.load() directly on the model file.
    # see: upgrade_state_dict(state_dict) in fairseq_model.py
    print(f"using checkpoint {checkpoint_file}")
    chkpt = hub_utils.from_pretrained(
        fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kwargs
    )

    args = vars(chkpt["args"]["model"])

    src_lang = args["source_lang"]
    tgt_lang = args["target_lang"]

    data_root = dirname(pytorch_dump_folder_path)
    model_dir = basename(pytorch_dump_folder_path)

    # dicts
    src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt")
    tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt")

    src_dict = Dictionary.load(src_dict_file)
    src_vocab = rewrite_dict_keys(src_dict.indices)
    src_vocab_size = len(src_vocab)
    src_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-src.json")
    print(f"Generating {src_vocab_file} of {src_vocab_size} of {src_lang} records")
    with open(src_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))

    # detect whether this is a do_lower_case situation, which can be derived by checking whether we
    # have at least one upcase letter in the source vocab
    do_lower_case = True
    for k in src_vocab.keys():
        if not k.islower():
            do_lower_case = False
            break

    tgt_dict = Dictionary.load(tgt_dict_file)
    tgt_vocab = rewrite_dict_keys(tgt_dict.indices)
    tgt_vocab_size = len(tgt_vocab)
    tgt_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-tgt.json")
    print(f"Generating {tgt_vocab_file} of {tgt_vocab_size} of {tgt_lang} records")
    with open(tgt_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent))

    # merges_file (bpecodes)
    merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"])
    for fn in ["bpecodes", "code"]:  # older fairseq called the merges file "code"
        fsmt_merges_file = os.path.join(fsmt_folder_path, fn)
        if os.path.exists(fsmt_merges_file):
            break
    with open(fsmt_merges_file, encoding="utf-8") as fin:
        merges = fin.read()
    merges = re.sub(r" \d+$", "", merges, 0, re.M)  # remove frequency number
    print(f"Generating {merges_file}")
    with open(merges_file, "w", encoding="utf-8") as fout:
        fout.write(merges)

    # model config
    fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")

    # validate bpe/tokenizer config, as currently it's hardcoded to moses+fastbpe -
    # may have to modify the tokenizer if a different type is used by a future model
    assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}"
    assert args["tokenizer"] == "moses", f"need to extend tokenizer to support bpe={args['tokenizer']}"

    model_conf = {
        "architectures": ["FSMTForConditionalGeneration"],
        "model_type": "fsmt",
        "activation_dropout": args["activation_dropout"],
        "activation_function": "relu",
        "attention_dropout": args["attention_dropout"],
        "d_model": args["decoder_embed_dim"],
        "dropout": args["dropout"],
        "init_std": 0.02,
        "max_position_embeddings": args["max_source_positions"],
        "num_hidden_layers": args["encoder_layers"],
        "src_vocab_size": src_vocab_size,
        "tgt_vocab_size": tgt_vocab_size,
        "langs": [src_lang, tgt_lang],
        "encoder_attention_heads": args["encoder_attention_heads"],
        "encoder_ffn_dim": args["encoder_ffn_embed_dim"],
        "encoder_layerdrop": args["encoder_layerdrop"],
        "encoder_layers": args["encoder_layers"],
        "decoder_attention_heads": args["decoder_attention_heads"],
        "decoder_ffn_dim": args["decoder_ffn_embed_dim"],
        "decoder_layerdrop": args["decoder_layerdrop"],
        "decoder_layers": args["decoder_layers"],
        "bos_token_id": 0,
        "pad_token_id": 1,
        "eos_token_id": 2,
        "is_encoder_decoder": True,
        "scale_embedding": not args["no_scale_embedding"],
        "tie_word_embeddings": args["share_all_embeddings"],
    }

    # good hparam defaults to start with
    model_conf["num_beams"] = 5
    model_conf["early_stopping"] = False
    if model_dir in best_score_hparams and "length_penalty" in best_score_hparams[model_dir]:
        model_conf["length_penalty"] = best_score_hparams[model_dir]["length_penalty"]
    else:
        model_conf["length_penalty"] = 1.0

    print(f"Generating {fsmt_model_config_file}")
    with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))

    # tokenizer config
    fsmt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE)

    tokenizer_conf = {
        "langs": [src_lang, tgt_lang],
        "model_max_length": 1024,
        "do_lower_case": do_lower_case,
    }

    print(f"Generating {fsmt_tokenizer_config_file}")
    with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))

    # model
    model = chkpt["models"][0]
    model_state_dict = model.state_dict()

    # rename keys to start with 'model.'
    model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items())

    # remove unneeded keys
    ignore_keys = [
        "model.model",
        "model.encoder.version",
        "model.decoder.version",
        "model.encoder_embed_tokens.weight",
        "model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)

    config = FSMTConfig.from_pretrained(pytorch_dump_folder_path)
    model_new = FSMTForConditionalGeneration(config)

    # check that it loads ok
    model_new.load_state_dict(model_state_dict, strict=False)

    # save
    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
    print(f"Generating {pytorch_weights_dump_path}")
    torch.save(model_state_dict, pytorch_weights_dump_path)

    print("Conversion is done!")
    print("\nLast step is to upload the files to s3")
    print(f"cd {data_root}")
    print(f"transformers-cli upload {model_dir}")
def from_pretrained(path, id2label, **kwargs):
    x = hub_utils.from_pretrained(path)
    return RobertaClassifier(x['args'], x['task'], x['models'][0], path, id2label)
model_path = os.path.join(root_path, "checkpoints/")
checkpoint_file = "checkpoint77.pt"
checkpoint_file = "checkpoint94.pt"
checkpoint_file = "checkpoint127.pt"
checkpoint_file = "checkpoint_best.pt"
vocab_model_file = "wikipedia_upper_voc_32000_sen10000000.model"
vocab_path = os.path.join(root_path, "vocab", vocab_model_file)

#%%
loaded = hub_utils.from_pretrained(
    model_name_or_path=model_path,
    checkpoint_file=checkpoint_file,
    data_name_or_path='./',
    bpe="sentencepiece",
    sentencepiece_vocab=vocab_path,
    load_checkpoint_heads=True,
    archive_map=RobertaModel.hub_models(),
    cpu=True,
)
roberta = RobertaHubInterface(loaded['args'], loaded['task'], loaded['models'][0])
roberta.eval()

#%%
def print_mask(s, predicted):
    print(s)
    for p in predicted:
        print(f'\t{p[2]} - {p[0]} - confidence {p[1]}')
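A hedged usage example for the helper above; the masked sentence is illustrative and not from the original script. `fill_mask` returns `(filled sentence, confidence, predicted token)` tuples, which is exactly what `print_mask` unpacks.

s = "The capital of <mask> is Paris."
print_mask(s, roberta.fill_mask(s, topk=5))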
def __init__(self, args, task):
    super(BertRanker, self).__init__(args, task)

    init_model = getattr(args, "pretrained_model", "")
    self.joint_layers = nn.ModuleList()
    if os.path.isfile(init_model):
        print(f"initialize weight from {init_model}")

        from fairseq import hub_utils

        x = hub_utils.from_pretrained(
            os.path.dirname(init_model),
            checkpoint_file=os.path.basename(init_model),
        )

        in_state_dict = x["models"][0].state_dict()
        init_args = x["args"].model

        num_positional_emb = init_args.max_positions + task.dictionary.pad() + 1

        # follow the setup in roberta
        self.model = TransformerSentenceEncoder(
            padding_idx=task.dictionary.pad(),
            vocab_size=len(task.dictionary),
            num_encoder_layers=getattr(args, "encoder_layers", init_args.encoder_layers),
            embedding_dim=init_args.encoder_embed_dim,
            ffn_embedding_dim=init_args.encoder_ffn_embed_dim,
            num_attention_heads=init_args.encoder_attention_heads,
            dropout=init_args.dropout,
            attention_dropout=init_args.attention_dropout,
            activation_dropout=init_args.activation_dropout,
            num_segments=2,  # add language embeddings
            max_seq_len=num_positional_emb,
            offset_positions_by_padding=False,
            encoder_normalize_before=True,
            apply_bert_init=True,
            activation_fn=init_args.activation_fn,
            freeze_embeddings=args.freeze_embeddings,
            n_trans_layers_to_freeze=args.n_trans_layers_to_freeze,
        )

        # still need to learn segment embeddings as we added a second language embedding
        if args.freeze_embeddings:
            for p in self.model.segment_embeddings.parameters():
                p.requires_grad = False

        update_init_roberta_model_state(in_state_dict)
        print("loading weights from the pretrained model")
        self.model.load_state_dict(
            in_state_dict, strict=False
        )  # ignore mismatch in language embeddings

        ffn_embedding_dim = init_args.encoder_ffn_embed_dim
        num_attention_heads = init_args.encoder_attention_heads
        dropout = init_args.dropout
        attention_dropout = init_args.attention_dropout
        activation_dropout = init_args.activation_dropout
        activation_fn = init_args.activation_fn

        classifier_embed_dim = getattr(args, "embed_dim", init_args.encoder_embed_dim)
        if classifier_embed_dim != init_args.encoder_embed_dim:
            self.transform_layer = nn.Linear(init_args.encoder_embed_dim, classifier_embed_dim)
    else:
        self.model = TransformerSentenceEncoder(
            padding_idx=task.dictionary.pad(),
            vocab_size=len(task.dictionary),
            num_encoder_layers=args.encoder_layers,
            embedding_dim=args.embed_dim,
            ffn_embedding_dim=args.ffn_embed_dim,
            num_attention_heads=args.attention_heads,
            dropout=args.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.activation_dropout,
            max_seq_len=task.max_positions() if task.max_positions() else args.tokens_per_sample,
            num_segments=2,
            offset_positions_by_padding=False,
            encoder_normalize_before=args.encoder_normalize_before,
            apply_bert_init=args.apply_bert_init,
            activation_fn=args.activation_fn,
        )

        classifier_embed_dim = args.embed_dim
        ffn_embedding_dim = args.ffn_embed_dim
        num_attention_heads = args.attention_heads
        dropout = args.dropout
        attention_dropout = args.attention_dropout
        activation_dropout = args.activation_dropout
        activation_fn = args.activation_fn

    self.joint_classification = args.joint_classification
    if args.joint_classification == "sent":
        if args.joint_normalize_before:
            self.joint_layer_norm = LayerNorm(classifier_embed_dim)
        else:
            self.joint_layer_norm = None

        self.joint_layers = nn.ModuleList(
            [
                TransformerSentenceEncoderLayer(
                    embedding_dim=classifier_embed_dim,
                    ffn_embedding_dim=ffn_embedding_dim,
                    num_attention_heads=num_attention_heads,
                    dropout=dropout,
                    attention_dropout=attention_dropout,
                    activation_dropout=activation_dropout,
                    activation_fn=activation_fn,
                )
                for _ in range(args.num_joint_layers)
            ]
        )

    self.classifier = RobertaClassificationHead(
        classifier_embed_dim,
        classifier_embed_dim,
        1,  # num_classes
        "tanh",
        args.classifier_dropout,
    )