def test_generate_fp16(self): config, input_ids, batch_size = self._get_config_and_data() attention_mask = input_ids.ne(1).to(torch_device) model = FSMTForConditionalGeneration(config).eval().to(torch_device) if torch_device == "cuda": model.half() model.generate(input_ids, attention_mask=attention_mask) model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
def test_generate_beam_search(self): input_ids = torch.Tensor([[71, 82, 2], [68, 34, 2]]).long().to(torch_device) config = self._get_config() lm_model = FSMTForConditionalGeneration(config).to(torch_device) lm_model.eval() max_length = 5 new_input_ids = lm_model.generate( input_ids.clone(), do_sample=True, num_return_sequences=1, num_beams=2, no_repeat_ngram_size=3, max_length=max_length, ) self.assertEqual(new_input_ids.shape, (input_ids.shape[0], max_length))
def get_model(self, mname): if mname not in self.models_cache: self.models_cache[ mname] = FSMTForConditionalGeneration.from_pretrained( mname).to(torch_device) if torch_device == "cuda": self.models_cache[mname].half() return self.models_cache[mname]
def test_dummy_inputs(self): config, *_ = self._get_config_and_data() model = FSMTForConditionalGeneration(config).eval().to(torch_device) model(**model.dummy_inputs)
def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path): # prep assert os.path.exists(fsmt_checkpoint_path) os.makedirs(pytorch_dump_folder_path, exist_ok=True) print(f"Writing results to {pytorch_dump_folder_path}") # handle various types of models checkpoint_file = basename(fsmt_checkpoint_path) fsmt_folder_path = dirname(fsmt_checkpoint_path) cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel models = cls.hub_models() kwargs = {"bpe": "fastbpe", "tokenizer": "moses"} data_name_or_path = "." # note: since the model dump is old, fairseq has upgraded its model some # time later, and it does a whole lot of rewrites and splits on the saved # weights, therefore we can't use torch.load() directly on the model file. # see: upgrade_state_dict(state_dict) in fairseq_model.py print(f"using checkpoint {checkpoint_file}") chkpt = hub_utils.from_pretrained(fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kwargs) args = vars(chkpt["args"]["model"]) src_lang = args["source_lang"] tgt_lang = args["target_lang"] data_root = dirname(pytorch_dump_folder_path) model_dir = basename(pytorch_dump_folder_path) # dicts src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt") tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt") src_dict = Dictionary.load(src_dict_file) src_vocab = rewrite_dict_keys(src_dict.indices) src_vocab_size = len(src_vocab) src_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-src.json") print( f"Generating {src_vocab_file} of {src_vocab_size} of {src_lang} records" ) with open(src_vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent)) # detect whether this is a do_lower_case situation, which can be derived by checking whether we # have at least one uppercase letter in the source vocab do_lower_case = True for k in src_vocab.keys(): if not k.islower(): do_lower_case = False break tgt_dict = Dictionary.load(tgt_dict_file) tgt_vocab = rewrite_dict_keys(tgt_dict.indices) tgt_vocab_size = len(tgt_vocab) tgt_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-tgt.json") print( f"Generating {tgt_vocab_file} of {tgt_vocab_size} of {tgt_lang} records" ) with open(tgt_vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent)) # merges_file (bpecodes) merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"]) for fn in ["bpecodes", "code"]: # older fairseq called the merges file "code" fsmt_merges_file = os.path.join(fsmt_folder_path, fn) if os.path.exists(fsmt_merges_file): break with open(fsmt_merges_file, encoding="utf-8") as fin: merges = fin.read() merges = re.sub(r" \d+$", "", merges, 0, re.M) # remove frequency number print(f"Generating {merges_file}") with open(merges_file, "w", encoding="utf-8") as fout: fout.write(merges) # model config fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json") # validate bpe/tokenizer config, as currently it's hardcoded to moses+fastbpe - # may have to modify the tokenizer if a different type is used by a future model assert args[ "bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}" assert args[ "tokenizer"] == "moses", f"need to extend tokenizer to support bpe={args['tokenizer']}" model_conf = { "architectures": ["FSMTForConditionalGeneration"], "model_type": "fsmt", "activation_dropout": args["activation_dropout"], "activation_function": "relu", "attention_dropout": args["attention_dropout"], "d_model": args["decoder_embed_dim"], "dropout": args["dropout"], "init_std": 0.02, "max_position_embeddings": args["max_source_positions"], "num_hidden_layers": args["encoder_layers"], "src_vocab_size": src_vocab_size, "tgt_vocab_size": tgt_vocab_size, "langs": [src_lang, tgt_lang], "encoder_attention_heads": args["encoder_attention_heads"], "encoder_ffn_dim": args["encoder_ffn_embed_dim"], "encoder_layerdrop": args["encoder_layerdrop"], "encoder_layers": args["encoder_layers"], "decoder_attention_heads": args["decoder_attention_heads"], "decoder_ffn_dim": args["decoder_ffn_embed_dim"], "decoder_layerdrop": args["decoder_layerdrop"], "decoder_layers": args["decoder_layers"], "bos_token_id": 0, "pad_token_id": 1, "eos_token_id": 2, "is_encoder_decoder": True, "scale_embedding": not args["no_scale_embedding"], "tie_word_embeddings": args["share_all_embeddings"], } # good hparam defaults to start with model_conf["num_beams"] = 5 model_conf["early_stopping"] = False if model_dir in best_score_hparams and "length_penalty" in best_score_hparams[ model_dir]: model_conf["length_penalty"] = best_score_hparams[model_dir][ "length_penalty"] else: model_conf["length_penalty"] = 1.0 print(f"Generating {fsmt_model_config_file}") with open(fsmt_model_config_file, "w", encoding="utf-8") as f: f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent)) # tokenizer config fsmt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE) tokenizer_conf = { "langs": [src_lang, tgt_lang], "model_max_length": 1024, "do_lower_case": do_lower_case, } print(f"Generating {fsmt_tokenizer_config_file}") with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f: f.write( json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent)) # model model = chkpt["models"][0] model_state_dict = model.state_dict() # rename keys to start with 'model.' model_state_dict = OrderedDict( ("model." + k, v) for k, v in model_state_dict.items()) # remove unneeded keys ignore_keys = [ "model.model", "model.encoder.version", "model.decoder.version", "model.encoder_embed_tokens.weight", "model.decoder_embed_tokens.weight", "model.encoder.embed_positions._float_tensor", "model.decoder.embed_positions._float_tensor", ] for k in ignore_keys: model_state_dict.pop(k, None) config = FSMTConfig.from_pretrained(pytorch_dump_folder_path) model_new = FSMTForConditionalGeneration(config) # check that it loads ok model_new.load_state_dict(model_state_dict, strict=False) # save pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) print(f"Generating {pytorch_weights_dump_path}") torch.save(model_state_dict, pytorch_weights_dump_path) print("Conversion is done!") print("\nLast step is to upload the files to s3") print(f"cd {data_root}") print(f"transformers-cli upload {model_dir}")
# # It will be used then as "stas/tiny-wmt19-en-de" # Build from transformers import FSMTTokenizer, FSMTConfig, FSMTForConditionalGeneration mname = "facebook/wmt19-en-de" tokenizer = FSMTTokenizer.from_pretrained(mname) # get the correct vocab sizes, etc. from the master model config = FSMTConfig.from_pretrained(mname) config.update(dict( d_model=4, encoder_layers=1, decoder_layers=1, encoder_ffn_dim=4, decoder_ffn_dim=4, encoder_attention_heads=1, decoder_attention_heads=1)) tiny_model = FSMTForConditionalGeneration(config) print(f"num of params {tiny_model.num_parameters()}") # Test batch = tokenizer(["Making tiny model"], return_tensors="pt") outputs = tiny_model(**batch) print("test output:", len(outputs.logits[0])) # Save mname_tiny = "tiny-wmt19-en-de" tiny_model.half() # makes it smaller tiny_model.save_pretrained(mname_tiny) tokenizer.save_pretrained(mname_tiny) print(f"Generated {mname_tiny}")
pairs = [ ["en", "ru"], ["ru", "en"], ["en", "de"], ["de", "en"], ] for src, tgt in pairs: print(f"Testing {src} -> {tgt}") # to switch to local model #mname = "/code/huggingface/transformers-fair-wmt/data/wmt19-{src}-{tgt}" # s3 uploaded model mname = f"stas/wmt19-{src}-{tgt}" src_sentence = text[src] tgt_sentence = text[tgt] tokenizer = FSMTTokenizer.from_pretrained(mname) model = FSMTForConditionalGeneration.from_pretrained(mname) encoded = tokenizer.encode(src_sentence, return_tensors='pt') #print(encoded) outputs = model.generate(encoded) #print(outputs) decoded = tokenizer.decode(outputs[0], skip_special_tokens=True) #print(decoded) assert decoded == tgt_sentence, f"\n\ngot: {decoded}\nexp: {tgt_sentence}\n"
def get_model(self, mname): model = FSMTForConditionalGeneration.from_pretrained(mname).to( torch_device) if torch_device == "cuda": model.half() return model