def check_encoder_decoder_model_generate(
    self, config, decoder_config, input_values=None, input_features=None, **kwargs
):
    """Run ``generate`` on a freshly composed model and check the output shape.

    The encoder/decoder pair comes from ``self.get_encoder_decoder_model``.
    ``input_features`` is used when given, otherwise ``input_values``. The
    generated tensor is expected to be ``(batch, decoder_config.max_length)``.
    """
    encoder, decoder = self.get_encoder_decoder_model(config, decoder_config)
    model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder)
    model.to(torch_device)

    # make sure EOS token is set to None to prevent early stopping of generation
    model_config = model.config
    model_config.eos_token_id = None
    if hasattr(model_config, "decoder") and hasattr(model_config.decoder, "eos_token_id"):
        model_config.decoder.eos_token_id = None

    if input_features is None:
        inputs = input_values
    else:
        inputs = input_features

    # Bert does not have a bos token id, so use pad_token_id instead
    start_token_id = model.config.decoder.pad_token_id
    generated_output = model.generate(inputs, decoder_start_token_id=start_token_id)

    expected_shape = (inputs.shape[0], decoder_config.max_length)
    self.assertEqual(generated_output.shape, expected_shape)
def check_encoder_decoder_model_output_attentions(
    self,
    config,
    attention_mask,
    decoder_config,
    decoder_input_ids,
    decoder_attention_mask,
    labels=None,
    input_values=None,
    input_features=None,
    **kwargs
):
    """Check the number and trailing shape of encoder, decoder and cross attentions.

    The model is called with ``output_attentions=True``. The encoder sequence
    length is obtained from ``encoder._get_feat_extract_output_lengths`` applied
    to dimension 1 of the speech input.
    """
    # make the decoder inputs a different shape from the encoder inputs to harden the test
    decoder_input_ids = decoder_input_ids[:, :-1]
    decoder_attention_mask = decoder_attention_mask[:, :-1]

    encoder, decoder = self.get_encoder_decoder_model(config, decoder_config)
    model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder)
    model.to(torch_device)
    outputs = model(
        input_values=input_values,
        input_features=input_features,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
        output_attentions=True,
    )

    inputs = input_values if input_features is None else input_features

    # encoder self-attention
    encoder_attentions = outputs["encoder_attentions"]
    self.assertEqual(len(encoder_attentions), config.num_hidden_layers)
    seq_len = model.encoder._get_feat_extract_output_lengths(inputs.shape[1])
    encoder_shape = (config.num_attention_heads, seq_len, seq_len)
    self.assertEqual(encoder_attentions[0].shape[-3:], encoder_shape)

    # decoder self-attention; some decoder configs expose `num_decoder_layers`
    decoder_attentions = outputs["decoder_attentions"]
    if hasattr(decoder_config, "num_decoder_layers"):
        num_decoder_layers = decoder_config.num_decoder_layers
    else:
        num_decoder_layers = decoder_config.num_hidden_layers
    self.assertEqual(len(decoder_attentions), num_decoder_layers)
    self.assertEqual(
        decoder_attentions[0].shape[-3:],
        (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]),
    )

    # cross-attention: decoder positions attend over encoder positions
    cross_attentions = outputs["cross_attentions"]
    self.assertEqual(len(cross_attentions), num_decoder_layers)
    target_len = decoder_input_ids.shape[-1]
    self.assertEqual(
        cross_attentions[0].shape[-3:],
        (decoder_config.num_attention_heads, target_len, seq_len),
    )
def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict):
    """Copy PyTorch weights into a Flax model and compare both on ``inputs_dict``.

    Both models are built from the same combined encoder-decoder config; the
    comparison itself is delegated to ``self.check_pt_flax_equivalence``.
    """
    joint_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)

    pt_model = SpeechEncoderDecoderModel(joint_config)
    fx_model = FlaxSpeechEncoderDecoderModel(joint_config)

    # overwrite the Flax parameters with the converted PyTorch state dict
    fx_model.params = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)

    self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
def check_encoder_decoder_model_generate(
    self, config, decoder_config, input_values=None, input_features=None, **kwargs
):
    """Run ``generate`` on a freshly composed model and check the output shape.

    The encoder/decoder pair comes from ``self.get_encoder_decoder_model``.
    ``input_features`` is used when given, otherwise ``input_values``. The
    generated tensor is expected to be ``(batch, decoder_config.max_length)``.
    """
    encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
    enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
    enc_dec_model.to(torch_device)

    # Make sure the EOS token is unset so generation cannot stop early; otherwise
    # the output may be shorter than `max_length` and the shape assertion below
    # fails intermittently.
    enc_dec_model.config.eos_token_id = None
    if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"):
        enc_dec_model.config.decoder.eos_token_id = None

    inputs = input_values if input_features is None else input_features

    # Bert does not have a bos token id, so use pad_token_id instead
    generated_output = enc_dec_model.generate(
        inputs, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id
    )
    self.assertEqual(generated_output.shape, (inputs.shape[0],) + (decoder_config.max_length,))
def check_encoder_decoder_model_from_pretrained(
    self,
    config,
    attention_mask,
    decoder_config,
    decoder_input_ids,
    decoder_attention_mask,
    return_dict,
    input_values=None,
    input_features=None,
    **kwargs
):
    """Build the model via ``from_encoder_decoder_pretrained`` and check the logits shape.

    Already-instantiated encoder and decoder modules are handed to the factory
    together with ``return_dict``. The forward pass itself always uses
    ``return_dict=True``.
    """
    encoder, decoder = self.get_encoder_decoder_model(config, decoder_config)
    model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
        encoder_model=encoder,
        decoder_model=decoder,
        return_dict=return_dict,
    )
    model.to(torch_device)

    outputs = model(
        input_values=input_values,
        input_features=input_features,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
        output_hidden_states=True,
        return_dict=True,
    )

    expected_shape = decoder_input_ids.shape + (decoder_config.vocab_size,)
    self.assertEqual(outputs["logits"].shape, expected_shape)
def test_real_model_save_load_from_pretrained(self):
    """Check that a save/load round trip leaves the first model output unchanged.

    The pretrained model from ``self.get_pretrained_model`` is run once, saved
    to a temporary directory, reloaded, and run again on the same inputs. NaNs
    are zeroed and the maximum absolute difference must be at most 1e-5.
    """
    model_2 = self.get_pretrained_model()
    model_2.to(torch_device)

    input_name, inputs = self.get_inputs()
    # NOTE(review): decoder ids are sampled using the *encoder* vocab size — confirm this is intended.
    decoder_input_ids = ids_tensor([13, 1], model_2.config.encoder.vocab_size)
    attention_mask = ids_tensor([13, 5], vocab_size=2)
    speech_inputs = {input_name: inputs}

    def _first_output_without_nans(model_outputs):
        # first output as a NumPy array with NaNs replaced by 0
        array = model_outputs[0].cpu().numpy()
        array[np.isnan(array)] = 0
        return array

    with torch.no_grad():
        outputs = model_2(
            **speech_inputs,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
        )
        out_2 = _first_output_without_nans(outputs)

        with tempfile.TemporaryDirectory() as tmp_dirname:
            model_2.save_pretrained(tmp_dirname)
            model_1 = SpeechEncoderDecoderModel.from_pretrained(tmp_dirname)
            model_1.to(torch_device)

            after_outputs = model_1(
                **speech_inputs,
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
            )
            out_1 = _first_output_without_nans(after_outputs)

            max_diff = np.amax(np.abs(out_1 - out_2))
            self.assertLessEqual(max_diff, 1e-5)
def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict):
    """Assert that a PyTorch and a Flax model agree, directly and after cross-loading.

    Three comparisons are made, each with a tolerance of 4e-2 on the logits
    and an equal number of tuple outputs:

    1. ``fx_model`` against ``pt_model``,
    2. a Flax model loaded from the saved PyTorch checkpoint against ``pt_model``,
    3. a PyTorch model loaded from the saved Flax checkpoint against ``fx_model``.

    Args:
        pt_model: PyTorch model; moved to ``torch_device`` and put in eval mode.
        fx_model: Flax model with matching weights.
        inputs_dict: keyword inputs for the Flax model; every value must
            support ``.tolist()`` so it can be mirrored as a torch tensor.
    """
    pt_model.to(torch_device)
    pt_model.eval()

    # Mirror the Flax inputs as torch tensors on the same device as the model;
    # leaving them on CPU breaks the forward pass when `torch_device` is not CPU.
    pt_inputs = {k: torch.tensor(v.tolist(), device=torch_device) for k, v in inputs_dict.items()}

    with torch.no_grad():
        pt_outputs = pt_model(**pt_inputs)
    pt_logits = pt_outputs.logits
    pt_outputs = pt_outputs.to_tuple()

    fx_outputs = fx_model(**inputs_dict)
    fx_logits = fx_outputs.logits
    fx_outputs = fx_outputs.to_tuple()

    self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
    # `.cpu()` is required before `.numpy()` for tensors living on an accelerator
    self.assert_almost_equals(fx_logits, pt_logits.cpu().numpy(), 4e-2)

    # PT -> Flax
    with tempfile.TemporaryDirectory() as tmpdirname:
        pt_model.save_pretrained(tmpdirname)
        fx_model_loaded = FlaxSpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_pt=True)

    fx_outputs_loaded = fx_model_loaded(**inputs_dict)
    fx_logits_loaded = fx_outputs_loaded.logits
    fx_outputs_loaded = fx_outputs_loaded.to_tuple()

    self.assertEqual(len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
    self.assert_almost_equals(fx_logits_loaded, pt_logits.cpu().numpy(), 4e-2)

    # Flax -> PT
    with tempfile.TemporaryDirectory() as tmpdirname:
        fx_model.save_pretrained(tmpdirname)
        pt_model_loaded = SpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_flax=True)

    pt_model_loaded.to(torch_device)
    pt_model_loaded.eval()

    with torch.no_grad():
        pt_outputs_loaded = pt_model_loaded(**pt_inputs)
    pt_logits_loaded = pt_outputs_loaded.logits
    pt_outputs_loaded = pt_outputs_loaded.to_tuple()

    self.assertEqual(len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch")
    self.assert_almost_equals(fx_logits, pt_logits_loaded.cpu().numpy(), 4e-2)
def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict):
    """Copy Flax weights into a PyTorch model and compare both on ``inputs_dict``.

    Both models are built from the same combined encoder-decoder config; the
    comparison itself is delegated to ``self.check_pt_flax_equivalence``.
    """
    joint_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)

    fresh_pt_model = SpeechEncoderDecoderModel(joint_config)
    fx_model = FlaxSpeechEncoderDecoderModel(joint_config)

    # load the Flax parameters into the PyTorch model
    pt_model = load_flax_weights_in_pytorch_model(fresh_pt_model, fx_model.params)

    self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
def check_encoder_decoder_model(
    self,
    config,
    attention_mask,
    decoder_config,
    decoder_input_ids,
    decoder_attention_mask,
    input_values=None,
    input_features=None,
    **kwargs
):
    """Check config flags and logits shape for a model composed from two modules.

    Two forward passes are run: one from the raw speech inputs, and one from
    precomputed ``encoder_outputs`` built from the last encoder hidden state
    of the first pass.
    """
    encoder, decoder = self.get_encoder_decoder_model(config, decoder_config)
    model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder)

    # composing the model must mark the decoder as a cross-attending decoder
    self.assertTrue(model.config.decoder.is_decoder)
    self.assertTrue(model.config.decoder.add_cross_attention)
    self.assertTrue(model.config.is_encoder_decoder)
    model.to(torch_device)

    # pass 1: from raw speech inputs
    first_outputs = model(
        input_values=input_values,
        input_features=input_features,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
        output_hidden_states=True,
    )
    self.assertEqual(
        first_outputs["logits"].shape,
        decoder_input_ids.shape + (decoder_config.vocab_size,),
    )

    # pass 2: reuse the encoder's final hidden state instead of re-encoding
    last_encoder_state = first_outputs.encoder_hidden_states[-1]
    encoder_outputs = BaseModelOutput(last_hidden_state=last_encoder_state)
    second_outputs = model(
        encoder_outputs=encoder_outputs,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
    )
    self.assertEqual(
        second_outputs["logits"].shape,
        decoder_input_ids.shape + (decoder_config.vocab_size,),
    )
def check_encoder_decoder_model_from_pretrained_configs(
    self,
    config,
    attention_mask,
    decoder_config,
    decoder_input_ids,
    decoder_attention_mask,
    input_values=None,
    input_features=None,
    **kwargs
):
    """Build the model from a combined config and check flags and logits shape."""
    joint_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
    self.assertTrue(joint_config.decoder.is_decoder)

    model = SpeechEncoderDecoderModel(joint_config)
    model.to(torch_device)
    model.eval()
    self.assertTrue(model.config.is_encoder_decoder)

    outputs = model(
        input_values=input_values,
        input_features=input_features,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
    )

    expected_shape = decoder_input_ids.shape + (decoder_config.vocab_size,)
    self.assertEqual(outputs["logits"].shape, expected_shape)
def check_save_and_load_encoder_decoder_model(
    self,
    config,
    attention_mask,
    decoder_config,
    decoder_input_ids,
    decoder_attention_mask,
    input_values=None,
    input_features=None,
    **kwargs
):
    """Save encoder and decoder separately, reload them, and compare outputs.

    The encoder and decoder are written to two temporary directories and
    ``from_encoder_decoder_pretrained`` is called on those directories. The
    first model output before and after must agree within 1e-5 once NaNs are
    zeroed.
    """
    encoder, decoder = self.get_encoder_decoder_model(config, decoder_config)
    enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder)
    enc_dec_model.to(torch_device)
    enc_dec_model.eval()

    forward_kwargs = {
        "input_values": input_values,
        "input_features": input_features,
        "decoder_input_ids": decoder_input_ids,
        "attention_mask": attention_mask,
        "decoder_attention_mask": decoder_attention_mask,
    }

    def _first_output_without_nans(model_outputs):
        # first output as a NumPy array with NaNs replaced by 0
        array = model_outputs[0].cpu().numpy()
        array[np.isnan(array)] = 0
        return array

    with torch.no_grad():
        out_2 = _first_output_without_nans(enc_dec_model(**forward_kwargs))

        with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname:
            enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname)
            enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname)
            # NOTE(review): the reloaded model is not bound to a name, so the second
            # forward pass below still runs on the original `enc_dec_model`. This only
            # verifies that reloading does not raise — confirm whether that is intended.
            SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder_pretrained_model_name_or_path=encoder_tmp_dirname,
                decoder_pretrained_model_name_or_path=decoder_tmp_dirname,
            )

            out_1 = _first_output_without_nans(enc_dec_model(**forward_kwargs))
            max_diff = np.amax(np.abs(out_1 - out_2))
            self.assertLessEqual(max_diff, 1e-5)
def check_encoder_decoder_model_with_inputs(
    self,
    config,
    attention_mask,
    decoder_config,
    decoder_input_ids,
    decoder_attention_mask,
    input_values=None,
    input_features=None,
    **kwargs
):
    """Check that speech inputs are accepted positionally and as ``inputs=``.

    ``input_features`` is used when given, otherwise ``input_values``. Both
    call styles must produce logits of shape
    ``decoder_input_ids.shape + (vocab_size,)``.
    """
    if input_features is None:
        inputs = input_values
    else:
        inputs = input_features

    encoder, decoder = self.get_encoder_decoder_model(config, decoder_config)
    model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder)
    model.to(torch_device)

    # arguments shared by both call styles
    shared_kwargs = {
        "decoder_input_ids": decoder_input_ids,
        "attention_mask": attention_mask,
        "decoder_attention_mask": decoder_attention_mask,
        "output_hidden_states": True,
    }

    # speech input passed as the first positional argument
    positional_outputs = model(inputs, **shared_kwargs)
    self.assertEqual(
        positional_outputs["logits"].shape,
        decoder_input_ids.shape + (decoder_config.vocab_size,),
    )

    # speech input passed through the generic `inputs` keyword
    keyword_outputs = model(inputs=inputs, **shared_kwargs)
    self.assertEqual(
        keyword_outputs["logits"].shape,
        decoder_input_ids.shape + (decoder_config.vocab_size,),
    )
def get_pretrained_model_and_inputs(self):
    """Return an S2T-encoder/BERT-decoder model and a matching random input dict.

    Inputs use a batch of 13: ``input_features`` of shape ``[13, 7, 80]`` with
    a ``[13, 7]`` attention mask, and ``[13, 4]`` decoder ids with a
    ``[13, 4]`` decoder attention mask.
    """
    model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
        "facebook/s2t-small-librispeech-asr", "bert-base-cased"
    )
    batch_size = 13
    encoder_shape = [batch_size, 7]
    decoder_shape = [batch_size, 4]

    # tensors are created in this fixed order so the random draws stay unchanged
    inputs = {}
    inputs["input_features"] = floats_tensor(encoder_shape + [80], scale=1.0)
    inputs["attention_mask"] = random_attention_mask(encoder_shape)
    inputs["decoder_input_ids"] = ids_tensor(decoder_shape, model.decoder.config.vocab_size)
    inputs["decoder_attention_mask"] = random_attention_mask(decoder_shape)

    return model, inputs
def get_pretrained_model_and_inputs(self):
    """Return a Wav2Vec2-encoder/BERT-decoder model and a matching random input dict.

    Inputs use a batch of 13: ``input_values`` of shape ``[13, 512]`` with a
    same-shaped attention mask, and ``[13, 4]`` decoder ids with a ``[13, 4]``
    decoder attention mask.
    """
    model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
        "facebook/wav2vec2-base-960h", "bert-base-cased"
    )
    batch_size = 13
    encoder_shape = [batch_size, 512]
    decoder_shape = [batch_size, 4]

    # tensors are created in this fixed order so the random draws stay unchanged
    inputs = {}
    inputs["input_values"] = floats_tensor(encoder_shape, scale=1.0)
    inputs["attention_mask"] = random_attention_mask(encoder_shape)
    inputs["decoder_input_ids"] = ids_tensor(decoder_shape, model.decoder.config.vocab_size)
    inputs["decoder_attention_mask"] = random_attention_mask(decoder_shape)

    return model, inputs
def test_real_model_save_load_from_pretrained(self):
    """Check that a save/load round trip leaves the first model output unchanged.

    The model and inputs come from ``self.get_pretrained_model_and_inputs``.
    NaNs are zeroed and the maximum absolute difference between the outputs
    before and after reloading must be at most 1e-5.
    """
    model_2, inputs = self.get_pretrained_model_and_inputs()
    model_2.to(torch_device)

    def _first_output_without_nans(model_outputs):
        # first output as a NumPy array with NaNs replaced by 0
        array = model_outputs[0].cpu().numpy()
        array[np.isnan(array)] = 0
        return array

    with torch.no_grad():
        out_2 = _first_output_without_nans(model_2(**inputs))

        with tempfile.TemporaryDirectory() as tmp_dirname:
            model_2.save_pretrained(tmp_dirname)
            model_1 = SpeechEncoderDecoderModel.from_pretrained(tmp_dirname)
            model_1.to(torch_device)

            out_1 = _first_output_without_nans(model_1(**inputs))
            max_diff = np.amax(np.abs(out_1 - out_2))
            self.assertLessEqual(max_diff, 1e-5)
def convert_wav2vec2_checkpoint(
    checkpoint_path,
    pytorch_dump_folder_path,
    dict_path,
    encoder_config_path,
    decoder_config_path,
    vocab_size,
    num_decoder_layers,
):
    """
    Copy/paste/tweak model's weights to transformers design.

    Loads a fairseq checkpoint, transfers its encoder weights into a
    `Wav2Vec2Model` and its decoder weights into a `Speech2Text2ForCausalLM`,
    wraps both in a `SpeechEncoderDecoderModel`, and writes the model,
    tokenizer (including `vocab.json`) and feature extractor to
    `pytorch_dump_folder_path`.

    Args:
        checkpoint_path: fairseq checkpoint to convert.
        pytorch_dump_folder_path: output directory; created if it does not exist.
        dict_path: fairseq dictionary file; its parent directory is passed to
            fairseq as the `data` override and the file is used to build the vocab.
        encoder_config_path: source for the `Wav2Vec2Config`.
        decoder_config_path: source for the `Speech2Text2Config`.
        vocab_size: decoder vocabulary size override.
        num_decoder_layers: decoder layer count override.
    """
    encoder_config = Wav2Vec2Config.from_pretrained(encoder_config_path)
    decoder_config = Speech2Text2Config.from_pretrained(
        decoder_config_path, vocab_size=vocab_size, decoder_layers=num_decoder_layers, do_stable_layer_norm=True
    )

    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0,
        do_normalize=True,
        return_attention_mask=True,
    )

    model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
    )
    model = model[0].eval()

    # set weights for wav2vec2 encoder
    hf_encoder = Wav2Vec2Model(encoder_config)
    projection_layer = recursively_load_weights_wav2vec2(model.encoder, hf_encoder)

    hf_decoder = Speech2Text2ForCausalLM(decoder_config)
    missing_keys, unexpected_keys = hf_decoder.model.decoder.load_state_dict(model.decoder.state_dict(), strict=False)

    # set output linear layer
    unexpected_keys.remove("embed_out")
    hf_decoder.lm_head.weight = nn.Parameter(model.decoder.embed_out.detach())

    # layer norm is init to identity matrix so leaving it is fine
    logger.warning(f"The following keys are missing when loading the decoder weights: {missing_keys}")
    logger.warning(f"The following keys are unexpected when loading the decoder weights: {unexpected_keys}")

    hf_wav2vec = SpeechEncoderDecoderModel(encoder=hf_encoder, decoder=hf_decoder)
    hf_wav2vec.config.tie_word_embeddings = False

    # add projection layer
    hf_wav2vec.enc_to_dec_proj.weight = nn.Parameter(projection_layer.weight)
    hf_wav2vec.enc_to_dec_proj.bias = nn.Parameter(projection_layer.bias)

    vocab_dict = create_vocab_dict(dict_path)

    # The vocab file is the first thing written to the dump folder, so make sure
    # the folder exists; otherwise `open` fails on a fresh output path.
    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
    vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
    with open(vocab_path, "w", encoding="utf-8") as fp:
        json.dump(vocab_dict, fp)

    tokenizer = Speech2Text2Tokenizer(vocab_path)
    tokenizer.save_pretrained(pytorch_dump_folder_path)

    # copy the tokenizer's special token ids and the processor classes into the config
    config = hf_wav2vec.config.to_dict()
    config["pad_token_id"] = tokenizer.pad_token_id
    config["bos_token_id"] = tokenizer.bos_token_id
    config["eos_token_id"] = tokenizer.eos_token_id
    config["tokenizer_class"] = "speech_to_text_2"
    config["feature_extractor_type"] = "wav2vec2"

    hf_wav2vec.config = SpeechEncoderDecoderConfig.from_dict(config)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
def get_pretrained_model(self):
    """Return a speech encoder-decoder built from Wav2Vec2 and BERT checkpoints."""
    encoder_checkpoint = "facebook/wav2vec2-base-960h"
    decoder_checkpoint = "bert-base-cased"
    return SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_checkpoint, decoder_checkpoint)
def get_pretrained_model(self):
    """Return a speech encoder-decoder built from Speech2Text and BERT checkpoints."""
    encoder_checkpoint = "facebook/s2t-small-librispeech-asr"
    decoder_checkpoint = "bert-base-cased"
    return SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_checkpoint, decoder_checkpoint)
def test_flaxwav2vec2bart_pt_flax_equivalence(self):
    """Check PyTorch/Flax agreement for the pretrained wav2vec2-2-bart-large model.

    Three comparisons are made, each with a tolerance of 4e-2 on the logits
    and an equal number of tuple outputs: the two loaded models directly, a
    Flax model reloaded from the saved PyTorch checkpoint, and a PyTorch model
    reloaded from the saved Flax checkpoint.
    """
    pt_model = SpeechEncoderDecoderModel.from_pretrained("patrickvonplaten/wav2vec2-2-bart-large")
    fx_model = FlaxSpeechEncoderDecoderModel.from_pretrained(
        "patrickvonplaten/wav2vec2-2-bart-large", from_pt=True
    )

    pt_model.to(torch_device)
    pt_model.eval()

    # prepare inputs
    batch_size = 13
    # NOTE(review): the encoder vocab size is passed as the second positional argument of
    # `floats_tensor`; in `get_pretrained_model_and_inputs` that slot is `scale` — confirm intent.
    input_values = floats_tensor([batch_size, 512], fx_model.config.encoder.vocab_size)
    attention_mask = random_attention_mask([batch_size, 512])
    decoder_input_ids = ids_tensor([batch_size, 4], fx_model.config.decoder.vocab_size)
    decoder_attention_mask = random_attention_mask([batch_size, 4])
    inputs_dict = {
        "inputs": input_values,
        "attention_mask": attention_mask,
        "decoder_input_ids": decoder_input_ids,
        "decoder_attention_mask": decoder_attention_mask,
    }

    # Mirror the Flax inputs as torch tensors on the same device as the model;
    # leaving them on CPU breaks the forward pass when `torch_device` is not CPU.
    pt_inputs = {k: torch.tensor(v.tolist(), device=torch_device) for k, v in inputs_dict.items()}

    with torch.no_grad():
        pt_outputs = pt_model(**pt_inputs)
    pt_logits = pt_outputs.logits
    pt_outputs = pt_outputs.to_tuple()

    fx_outputs = fx_model(**inputs_dict)
    fx_logits = fx_outputs.logits
    fx_outputs = fx_outputs.to_tuple()

    self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
    # `.cpu()` is required before `.numpy()` for tensors living on an accelerator
    self.assert_almost_equals(fx_logits, pt_logits.cpu().numpy(), 4e-2)

    # PT -> Flax
    with tempfile.TemporaryDirectory() as tmpdirname:
        pt_model.save_pretrained(tmpdirname)
        fx_model_loaded = FlaxSpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_pt=True)

    fx_outputs_loaded = fx_model_loaded(**inputs_dict)
    fx_logits_loaded = fx_outputs_loaded.logits
    fx_outputs_loaded = fx_outputs_loaded.to_tuple()

    self.assertEqual(len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
    self.assert_almost_equals(fx_logits_loaded, pt_logits.cpu().numpy(), 4e-2)

    # Flax -> PT
    with tempfile.TemporaryDirectory() as tmpdirname:
        fx_model.save_pretrained(tmpdirname)
        pt_model_loaded = SpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_flax=True)

    pt_model_loaded.to(torch_device)
    pt_model_loaded.eval()

    with torch.no_grad():
        pt_outputs_loaded = pt_model_loaded(**pt_inputs)
    pt_logits_loaded = pt_outputs_loaded.logits
    pt_outputs_loaded = pt_outputs_loaded.to_tuple()

    self.assertEqual(len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch")
    self.assert_almost_equals(fx_logits, pt_logits_loaded.cpu().numpy(), 4e-2)
def get_pretrained_model(self):
    """Return a speech encoder-decoder built from the BERT-large and BART-large checkpoints."""
    encoder_checkpoint = "bert-large-uncased"
    decoder_checkpoint = "facebook/bart-large"
    return SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_checkpoint, decoder_checkpoint)
def convert_wav2vec2_checkpoint(
    checkpoint_path,
    pytorch_dump_folder_path,
    dict_path,
    config_yaml_path,
    encoder_config_path,
    decoder_config_path,
    add_adapter,
    adapter_kernel_size,
    adapter_stride,
    decoder_start_token_id,
    encoder_output_dim,
):
    """
    Copy/paste/tweak model's weights to transformers design.

    Loads a fairseq checkpoint, transfers its encoder weights into a
    `Wav2Vec2Model` and its decoder weights into an `MBartForCausalLM`, wraps
    both in a `SpeechEncoderDecoderModel`, and saves the model, an
    `MBart50Tokenizer` and the feature extractor to `pytorch_dump_folder_path`.

    Args:
        checkpoint_path: fairseq checkpoint to convert; also passed to fairseq
            as the `w2v_path` override.
        pytorch_dump_folder_path: output directory.
        dict_path: tokenizer vocabulary file; its parent directory is passed to
            fairseq as the `data` override.
        config_yaml_path: passed to fairseq as the `config_yaml` override.
        encoder_config_path: source for the `Wav2Vec2Config` and the feature extractor.
        decoder_config_path: source for the `MBartConfig`.
        add_adapter: forwarded to the encoder config as `add_adapter`.
        adapter_kernel_size: forwarded to the encoder config.
        adapter_stride: forwarded to the encoder config.
        decoder_start_token_id: currently unused, see the note in the body.
        encoder_output_dim: forwarded to the encoder config as `output_hidden_size`.
    """
    # load configs
    encoder_config = Wav2Vec2Config.from_pretrained(
        encoder_config_path,
        # honour the caller's choice instead of always forcing the adapter on
        add_adapter=add_adapter,
        adapter_stride=adapter_stride,
        adapter_kernel_size=adapter_kernel_size,
        use_auth_token=True,
        output_hidden_size=encoder_output_dim,
    )
    decoder_config = MBartConfig.from_pretrained(decoder_config_path)

    # load model
    model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [checkpoint_path],
        arg_overrides={
            "config_yaml": config_yaml_path,
            "data": "/".join(dict_path.split("/")[:-1]),
            "w2v_path": checkpoint_path,
            "load_pretrained_decoder_from": None,
        },
    )
    model = model[0].eval()

    # load feature extractor
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(encoder_config_path, use_auth_token=True)

    # set weights for wav2vec2 encoder
    hf_encoder = Wav2Vec2Model(encoder_config)

    recursively_load_weights_wav2vec2(model.encoder, hf_encoder)

    # load decoder weights
    hf_decoder = MBartForCausalLM(decoder_config)
    missing_keys, unexpected_keys = hf_decoder.model.decoder.load_state_dict(model.decoder.state_dict(), strict=False)
    logger.warning(f"The following keys are missing when loading the decoder weights: {missing_keys}")
    logger.warning(f"The following keys are unexpected when loading the decoder weights: {unexpected_keys}")

    hf_wav2vec = SpeechEncoderDecoderModel(encoder=hf_encoder, decoder=hf_decoder)
    hf_wav2vec.config.tie_word_embeddings = False

    tokenizer = MBart50Tokenizer(dict_path)
    tokenizer.save_pretrained(pytorch_dump_folder_path)

    # copy the tokenizer's special token ids and the processor classes into the config
    config = hf_wav2vec.config.to_dict()
    config["pad_token_id"] = tokenizer.pad_token_id
    config["bos_token_id"] = tokenizer.bos_token_id
    config["eos_token_id"] = tokenizer.eos_token_id
    config["tokenizer_class"] = "mbart50"
    config["feature_extractor_type"] = "wav2vec2"

    # NOTE(review): the `decoder_start_token_id` argument is never read; the start token
    # is taken from the tokenizer's EOS id and the forced BOS id is hard-coded. Confirm
    # which of these the argument was meant to control before wiring it in.
    config["decoder_start_token_id"] = tokenizer.eos_token_id
    config["forced_bos_token_id"] = 250004
    config["forced_eos_token_id"] = tokenizer.eos_token_id

    hf_wav2vec.config = SpeechEncoderDecoderConfig.from_dict(config)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
    feature_extractor.save_pretrained(pytorch_dump_folder_path)