def check_encoder_decoder_model_from_pretrained_configs(
        self,
        config,
        inputs,
        attention_mask,
        encoder_hidden_states,
        decoder_config,
        decoder_input_ids,
        decoder_attention_mask,
        **kwargs
    ):
        encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
        self.assertTrue(encoder_decoder_config.decoder.is_decoder)

        enc_dec_model = FlaxSpeechEncoderDecoderModel(encoder_decoder_config)

        self.assertTrue(enc_dec_model.config.is_encoder_decoder)
        self.assertFalse(enc_dec_model.config.tie_word_embeddings)

        outputs_encoder_decoder = enc_dec_model(
            inputs=inputs,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
        )

        self.assertEqual(
            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
        )
def check_encoder_decoder_model_from_pretrained_configs(
        self,
        config,
        attention_mask,
        decoder_config,
        decoder_input_ids,
        decoder_attention_mask,
        input_values=None,
        input_features=None,
        **kwargs
    ):
        encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
        self.assertTrue(encoder_decoder_config.decoder.is_decoder)

        enc_dec_model = SpeechEncoderDecoderModel(encoder_decoder_config)
        enc_dec_model.to(torch_device)
        enc_dec_model.eval()

        self.assertTrue(enc_dec_model.config.is_encoder_decoder)

        outputs_encoder_decoder = enc_dec_model(
            input_values=input_values,
            input_features=input_features,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )

        self.assertEqual(
            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
        )
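
# A minimal sketch (not from the source; the GPT2 decoder is a placeholder
# choice) of the config composition both tests above exercise:
# from_encoder_decoder_configs marks the decoder config as a decoder and
# enables cross-attention on it.
from transformers import GPT2Config, SpeechEncoderDecoderConfig, Wav2Vec2Config

combined = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(Wav2Vec2Config(), GPT2Config())
assert combined.is_encoder_decoder
assert combined.decoder.is_decoder
assert combined.decoder.add_cross_attention
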
    def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict):
        encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)

        pt_model = SpeechEncoderDecoderModel(encoder_decoder_config)
        fx_model = FlaxSpeechEncoderDecoderModel(encoder_decoder_config)

        pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)

        self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
    def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict):
        encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)

        pt_model = SpeechEncoderDecoderModel(encoder_decoder_config)
        fx_model = FlaxSpeechEncoderDecoderModel(encoder_decoder_config)

        fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
        fx_model.params = fx_state

        self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
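
# A minimal round-trip sketch (not from the source; BERT stands in for the
# speech model) of the two conversion helpers used above, checking that the
# PyTorch and Flax weights stay interchangeable:
import numpy as np
import torch
from transformers import BertConfig, BertModel, FlaxBertModel
from transformers.modeling_flax_pytorch_utils import (
    convert_pytorch_state_dict_to_flax,
    load_flax_weights_in_pytorch_model,
)

config = BertConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=2, intermediate_size=64)
pt_model = BertModel(config).eval()
fx_model = FlaxBertModel(config)

# PyTorch -> Flax: convert the state dict and assign it as the Flax params
fx_model.params = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
# Flax -> PyTorch: load the Flax params into a freshly initialised PyTorch model
pt_model = load_flax_weights_in_pytorch_model(BertModel(config).eval(), fx_model.params)

input_ids = torch.arange(10).reshape(1, 10)
with torch.no_grad():
    pt_out = pt_model(input_ids).last_hidden_state.numpy()
fx_out = np.asarray(fx_model(input_ids.numpy()).last_hidden_state)
assert np.allclose(pt_out, fx_out, atol=1e-4)
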
    def check_freeze_feature_encoder(
        self,
        config,
        inputs,
        attention_mask,
        encoder_hidden_states,
        decoder_config,
        decoder_input_ids,
        decoder_attention_mask,
        **kwargs
    ):
        encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
        enc_dec_model = FlaxSpeechEncoderDecoderModel(encoder_decoder_config)
        params = enc_dec_model.params

        def cross_entropy(logits, labels):
            return -jnp.sum(labels * jax.nn.log_softmax(logits, axis=-1), axis=-1)

        # define a dummy loss function for computing the loss over a forward pass
        def compute_loss(
            params,
            inputs,
            attention_mask,
            decoder_input_ids,
            freeze_feature_encoder: bool = False,
        ):
            outputs_enc_dec = enc_dec_model(
                inputs=inputs,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                freeze_feature_encoder=freeze_feature_encoder,
                params=params,
            )
            logits = outputs_enc_dec.logits
            vocab_size = logits.shape[-1]
            loss = cross_entropy(logits, onehot(labels=decoder_input_ids, num_classes=vocab_size)).sum()
            return (loss, logits)

        # transform the loss function to get the gradients
        grad_fn = jax.value_and_grad(compute_loss, has_aux=True)

        # compute the loss, logits, and gradients for the unfrozen model
        (loss, logits), grads = grad_fn(
            params, inputs, attention_mask, decoder_input_ids, freeze_feature_encoder=False
        )

        # compare to the loss, logits and gradients for the frozen model
        (loss_frozen, logits_frozen), grads_frozen = grad_fn(
            params, inputs, attention_mask, decoder_input_ids, freeze_feature_encoder=True
        )

        # ensure that the logits and losses remain precisely equal
        self.assertTrue((logits == logits_frozen).all())
        self.assertEqual(loss, loss_frozen)

        grads = flatten_dict(grads)
        grads_frozen = flatten_dict(grads_frozen)

        # ensure that the dicts of gradients contain the same keys
        self.assertEqual(grads.keys(), grads_frozen.keys())

        # ensure that the gradients of the feature extractor layers are precisely zero when frozen and contain non-zero entries when unfrozen
        feature_extractor_grads = tuple(grads[k] for k in grads if "feature_extractor" in k)
        feature_extractor_grads_frozen = tuple(grads_frozen[k] for k in grads_frozen if "feature_extractor" in k)

        for feature_extractor_grad, feature_extractor_grad_frozen in zip(
            feature_extractor_grads, feature_extractor_grads_frozen
        ):
            self.assertTrue((feature_extractor_grad_frozen == 0.0).all())
            self.assertTrue((feature_extractor_grad > 0.0).any())

        # ensure that the gradients of all unfrozen layers remain precisely equal, i.e. all layers excluding the frozen 'feature_extractor'
        grads = tuple(grads[k] for k in grads if "feature_extractor" not in k)
        grads_frozen = tuple(grads_frozen[k] for k in grads_frozen if "feature_extractor" not in k)

        for grad, grad_frozen in zip(grads, grads_frozen):
            self.assertTrue((grad == grad_frozen).all())
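
# A minimal sketch (not from the source) of the mechanism the freeze test
# relies on: wrapping a sub-computation in jax.lax.stop_gradient leaves the
# forward values unchanged but zeroes the gradients flowing into it.
import jax
import jax.numpy as jnp

def loss_fn(w, x, freeze=False):
    feat = w * x  # stand-in for the feature extractor
    if freeze:
        feat = jax.lax.stop_gradient(feat)
    return jnp.sum(feat**2)

w, x = jnp.asarray(2.0), jnp.asarray(3.0)
print(jax.grad(loss_fn)(w, x, freeze=False))  # 36.0: gradient flows into w
print(jax.grad(loss_fn)(w, x, freeze=True))   # 0.0: gradient is blocked
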
def convert_wav2vec2_checkpoint(
    checkpoint_path,
    pytorch_dump_folder_path,
    dict_path,
    encoder_config_path,
    decoder_config_path,
    vocab_size,
    num_decoder_layers,
):
    """
    Copy/paste/tweak the model's weights into the transformers design.
    """
    encoder_config = Wav2Vec2Config.from_pretrained(encoder_config_path)
    decoder_config = Speech2Text2Config.from_pretrained(
        decoder_config_path, vocab_size=vocab_size, decoder_layers=num_decoder_layers, do_stable_layer_norm=True
    )

    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0,
        do_normalize=True,
        return_attention_mask=True,
    )

    model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
    )
    model = model[0].eval()

    # set weights for wav2vec2 encoder
    hf_encoder = Wav2Vec2Model(encoder_config)
    projection_layer = recursively_load_weights_wav2vec2(model.encoder, hf_encoder)

    hf_decoder = Speech2Text2ForCausalLM(decoder_config)
    missing_keys, unexpected_keys = hf_decoder.model.decoder.load_state_dict(model.decoder.state_dict(), strict=False)

    # set output linear layer
    unexpected_keys.remove("embed_out")
    hf_decoder.lm_head.weight = nn.Parameter(model.decoder.embed_out.detach())

    # layer norm is initialized to the identity, so leaving it untouched is fine
    logger.warning(f"The following keys are missing when loading the decoder weights: {missing_keys}")
    logger.warning(f"The following keys are unexpected when loading the decoder weights: {unexpected_keys}")

    hf_wav2vec = SpeechEncoderDecoderModel(encoder=hf_encoder, decoder=hf_decoder)
    hf_wav2vec.config.tie_word_embeddings = False

    # add projection layer
    hf_wav2vec.enc_to_dec_proj.weight = nn.Parameter(projection_layer.weight)
    hf_wav2vec.enc_to_dec_proj.bias = nn.Parameter(projection_layer.bias)

    vocab_dict = create_vocab_dict(dict_path)

    with open(os.path.join(pytorch_dump_folder_path, "vocab.json"), "w") as fp:
        json.dump(vocab_dict, fp)

    tokenizer = Speech2Text2Tokenizer(os.path.join(pytorch_dump_folder_path, "vocab.json"))
    tokenizer.save_pretrained(pytorch_dump_folder_path)

    config = hf_wav2vec.config.to_dict()
    config["pad_token_id"] = tokenizer.pad_token_id
    config["bos_token_id"] = tokenizer.bos_token_id
    config["eos_token_id"] = tokenizer.eos_token_id
    config["tokenizer_class"] = "speech_to_text_2"
    config["feature_extractor_type"] = "wav2vec2"

    hf_wav2vec.config = SpeechEncoderDecoderConfig.from_dict(config)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
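
# A hedged usage sketch (the dump path is a placeholder) of loading the
# artifacts the conversion above writes out and running a generation pass:
import numpy as np
from transformers import SpeechEncoderDecoderModel, Speech2Text2Tokenizer, Wav2Vec2FeatureExtractor

dump_folder = "/path/to/pytorch_dump_folder"
model = SpeechEncoderDecoderModel.from_pretrained(dump_folder)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(dump_folder)
tokenizer = Speech2Text2Tokenizer.from_pretrained(dump_folder)

# one second of silence at the 16 kHz rate the feature extractor was saved with
inputs = feature_extractor(np.zeros(16000), sampling_rate=16000, return_tensors="pt")
generated_ids = model.generate(inputs.input_values)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
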
def convert_wav2vec2_checkpoint(
    checkpoint_path,
    pytorch_dump_folder_path,
    dict_path,
    config_yaml_path,
    encoder_config_path,
    decoder_config_path,
    add_adapter,
    adapter_kernel_size,
    adapter_stride,
    decoder_start_token_id,
    encoder_output_dim,
):
    """
    Copy/paste/tweak the model's weights into the transformers design.
    """
    # load configs
    encoder_config = Wav2Vec2Config.from_pretrained(
        encoder_config_path,
        add_adapter=True,
        adapter_stride=adapter_stride,
        adapter_kernel_size=adapter_kernel_size,
        use_auth_token=True,
        output_hidden_size=encoder_output_dim,
    )
    decoder_config = MBartConfig.from_pretrained(decoder_config_path)

    # load model
    model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [checkpoint_path],
        arg_overrides={
            "config_yaml": config_yaml_path,
            "data": "/".join(dict_path.split("/")[:-1]),
            "w2v_path": checkpoint_path,
            "load_pretrained_decoder_from": None,
        },
    )
    model = model[0].eval()

    # load feature extractor
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(encoder_config_path, use_auth_token=True)

    # set weights for wav2vec2 encoder
    hf_encoder = Wav2Vec2Model(encoder_config)

    recursively_load_weights_wav2vec2(model.encoder, hf_encoder)

    # load decoder weights
    hf_decoder = MBartForCausalLM(decoder_config)
    missing_keys, unexpected_keys = hf_decoder.model.decoder.load_state_dict(model.decoder.state_dict(), strict=False)
    logger.warning(f"The following keys are missing when loading the decoder weights: {missing_keys}")
    logger.warning(f"The following keys are unexpected when loading the decoder weights: {unexpected_keys}")

    hf_wav2vec = SpeechEncoderDecoderModel(encoder=hf_encoder, decoder=hf_decoder)
    hf_wav2vec.config.tie_word_embeddings = False

    tokenizer = MBart50Tokenizer(dict_path)
    tokenizer.save_pretrained(pytorch_dump_folder_path)

    config = hf_wav2vec.config.to_dict()
    config["pad_token_id"] = tokenizer.pad_token_id
    config["bos_token_id"] = tokenizer.bos_token_id
    config["eos_token_id"] = tokenizer.eos_token_id
    config["tokenizer_class"] = "mbart50"
    config["feature_extractor_type"] = "wav2vec2"

    config["decoder_start_token_id"] = tokenizer.eos_token_id
    config["forced_bos_token_id"] = 250004
    config["forced_eos_token_id"] = tokenizer.eos_token_id

    hf_wav2vec.config = SpeechEncoderDecoderConfig.from_dict(config)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
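
# A hedged sanity check (the dump path is a placeholder) that the token-id
# overrides written into the config above survive the save/load round trip:
from transformers import SpeechEncoderDecoderModel

model = SpeechEncoderDecoderModel.from_pretrained("/path/to/pytorch_dump_folder")
assert model.config.decoder_start_token_id == model.config.eos_token_id
assert model.config.forced_bos_token_id == 250004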