def convert_unispeech_sat_checkpoint( checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True ): """ Copy/paste/tweak model's weights to transformers design. """ if config_path is not None: config = UniSpeechSatConfig.from_pretrained(config_path) else: config = UniSpeechSatConfig() dict_path = "" if is_finetuned: hf_wav2vec = UniSpeechSatForCTC(config) else: hf_wav2vec = UniSpeechSatForPreTraining(config) model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])} ) model = model[0].eval() recursively_load_weights(model, hf_wav2vec) hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
def get_config(self): return UniSpeechSatConfig( hidden_size=self.hidden_size, feat_extract_norm=self.feat_extract_norm, feat_extract_dropout=self.feat_extract_dropout, feat_extract_activation=self.feat_extract_activation, conv_dim=self.conv_dim, conv_stride=self.conv_stride, conv_kernel=self.conv_kernel, conv_bias=self.conv_bias, num_conv_pos_embeddings=self.num_conv_pos_embeddings, num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, mask_time_prob=self.mask_time_prob, mask_time_length=self.mask_time_length, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, hidden_dropout_prob=self.hidden_dropout_prob, intermediate_size=self.intermediate_size, layer_norm_eps=self.layer_norm_eps, hidden_act=self.hidden_act, initializer_range=self.initializer_range, vocab_size=self.vocab_size, tdnn_dim=self.tdnn_dim, tdnn_kernel=self.tdnn_kernel, tdnn_dilation=self.tdnn_dilation, xvector_output_dim=self.xvector_output_dim, )
def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path): """ Copy/paste/tweak model's weights to transformers design. """ checkpoint = torch.load(checkpoint_path, map_location="cpu") downstream_dict = checkpoint["Downstream"] hf_config = UniSpeechSatConfig.from_pretrained(config_path) hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( base_model_name, return_attention_mask=True, do_normalize=False) arch = hf_config.architectures[0] if arch.endswith("ForSequenceClassification"): hf_model = convert_classification(base_model_name, hf_config, downstream_dict) elif arch.endswith("ForAudioFrameClassification"): hf_model = convert_diarization(base_model_name, hf_config, downstream_dict) elif arch.endswith("ForXVector"): hf_model = convert_xvector(base_model_name, hf_config, downstream_dict) else: raise NotImplementedError( f"S3PRL weights conversion is not supported for {arch}") if hf_config.use_weighted_layer_sum: hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"] hf_feature_extractor.save_pretrained(model_dump_path) hf_model.save_pretrained(model_dump_path)