def get_config(self):
    return SEWDConfig(
        hidden_size=self.hidden_size,
        feat_extract_norm=self.feat_extract_norm,
        feat_extract_dropout=self.feat_extract_dropout,
        feat_extract_activation=self.feat_extract_activation,
        conv_dim=self.conv_dim,
        conv_stride=self.conv_stride,
        conv_kernel=self.conv_kernel,
        conv_bias=self.conv_bias,
        num_conv_pos_embeddings=self.num_conv_pos_embeddings,
        num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups,
        squeeze_factor=self.squeeze_factor,
        max_position_embeddings=self.max_position_embeddings,
        position_buckets=self.position_buckets,
        share_att_key=self.share_att_key,
        relative_attention=self.relative_attention,
        position_biased_input=self.position_biased_input,
        pos_att_type=self.pos_att_type,
        norm_rel_ebd=self.norm_rel_ebd,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        hidden_dropout=self.hidden_dropout,
        intermediate_size=self.intermediate_size,
        layer_norm_eps=self.layer_norm_eps,
        hidden_act=self.hidden_act,
        initializer_range=self.initializer_range,
        vocab_size=self.vocab_size,
    )
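# A minimal sketch (not part of the original tester) of how the config above is
# typically consumed in the transformers test pattern: build a model from
# get_config() and run a forward pass in eval mode. The method name and the
# `input_values` argument are assumptions for illustration only.
def create_and_check_model_sketch(self, input_values, attention_mask=None):
    import torch  # assumed available in the test environment

    config = self.get_config()
    model = SEWDModel(config=config)
    model.eval()
    with torch.no_grad():
        result = model(input_values, attention_mask=attention_mask)
    # The conv feature extractor subsamples the time axis, so only the batch
    # dimension and hidden size are checked here.
    assert result.last_hidden_state.shape[0] == input_values.shape[0]
    assert result.last_hidden_state.shape[-1] == self.hidden_size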
def convert_sew_checkpoint(
    checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if is_finetuned:
        # A fine-tuned checkpoint needs the fairseq task's data directory so the dictionary can be resolved.
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
        )
    else:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])

    if config_path is not None:
        config = SEWDConfig.from_pretrained(config_path)
    else:
        config = convert_config(model[0])
    model = model[0].eval()

    return_attention_mask = config.feat_extract_norm == "layer"
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0,
        do_normalize=True,
        return_attention_mask=return_attention_mask,
    )

    if is_finetuned:
        if dict_path:
            target_dict = Dictionary.load(dict_path)

            # important: swap the bos & pad token ids, since the CTC symbol is <pad> and not <s> as in fairseq
            config.bos_token_id = target_dict.pad_index
            config.pad_token_id = target_dict.bos_index
            config.eos_token_id = target_dict.eos_index
            config.vocab_size = len(target_dict.symbols)
            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
            if not os.path.isdir(pytorch_dump_folder_path):
                logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
                return
            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                json.dump(target_dict.indices, vocab_handle)
            tokenizer = Wav2Vec2CTCTokenizer(
                vocab_path,
                unk_token=target_dict.unk_word,
                pad_token=target_dict.pad_word,
                bos_token=target_dict.bos_word,
                eos_token=target_dict.eos_word,
                word_delimiter_token="|",
                do_lower_case=False,
            )
            processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
            processor.save_pretrained(pytorch_dump_folder_path)

        hf_model = SEWDForCTC(config)
    else:
        hf_model = SEWDModel(config)
        feature_extractor.save_pretrained(pytorch_dump_folder_path)

    recursively_load_weights(model, hf_model, is_finetuned)

    hf_model.save_pretrained(pytorch_dump_folder_path)
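# A plausible CLI entry point for the converter above, sketched here for
# completeness. The flag names mirror the function's parameters, but the exact
# argparse setup is an assumption, not necessarily the script's original one.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to the fairseq checkpoint")
    parser.add_argument("--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model")
    parser.add_argument("--config_path", type=str, default=None, help="Path to an hf config.json to use instead of converting")
    parser.add_argument("--dict_path", type=str, default=None, help="Path to the fairseq dictionary of a fine-tuned model")
    parser.add_argument("--is_finetuned", action="store_true", help="Whether the checkpoint is a fine-tuned CTC model")
    args = parser.parse_args()

    convert_sew_checkpoint(
        args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, args.is_finetuned
    )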
def convert_config(model):
    config = SEWDConfig()
    fs_config = model.cfg

    config.activation_dropout = fs_config.activation_dropout
    config.apply_spec_augment = fs_config.mask_prob > 0 or fs_config.mask_channel_prob > 0
    config.attention_dropout = fs_config.attention_dropout
    config.conv_bias = fs_config.conv_bias
    # conv_feature_layers is stored as a string of (dim, kernel, stride) tuples,
    # e.g. "[(512, 10, 5), (512, 3, 2), ...]", so it is evaluated back into a list.
    conv_layers = eval(fs_config.conv_feature_layers)
    config.conv_dim = [x[0] for x in conv_layers]
    config.conv_kernel = [x[1] for x in conv_layers]
    config.conv_stride = [x[2] for x in conv_layers]
    config.feat_extract_activation = "gelu"
    config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group"
    config.feat_proj_dropout = fs_config.dropout_input
    config.final_dropout = 0.0
    config.hidden_act = fs_config.activation_fn.name
    config.hidden_dropout = fs_config.dropout
    config.hidden_size = fs_config.encoder_embed_dim
    config.initializer_range = 0.02
    config.intermediate_size = fs_config.encoder_ffn_embed_dim
    config.layer_norm_eps = 1e-5
    config.layerdrop = fs_config.encoder_layerdrop
    config.mask_feature_length = fs_config.mask_channel_length
    config.mask_feature_prob = fs_config.mask_channel_prob
    config.mask_time_length = fs_config.mask_length
    config.mask_time_prob = fs_config.mask_prob
    config.num_attention_heads = fs_config.encoder_attention_heads
    config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups
    config.num_conv_pos_embeddings = fs_config.conv_pos
    config.num_feat_extract_layers = len(conv_layers)
    config.num_hidden_layers = fs_config.encoder_layers
    config.squeeze_factor = fs_config.squeeze_factor
    # DeBERTa-specific parameters:
    config.max_position_embeddings = fs_config.max_position_embeddings
    config.position_buckets = fs_config.position_buckets
    config.share_att_key = fs_config.share_att_key
    config.relative_attention = fs_config.relative_attention
    config.position_biased_input = fs_config.position_biased_input
    config.pos_att_type = tuple(fs_config.pos_att_type.split("|"))
    config.norm_rel_ebd = fs_config.norm_rel_ebd
    return config
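# Hypothetical usage sketch (not part of the original script): derive an hf
# SEWDConfig straight from a pretrained fairseq checkpoint and inspect a few of
# the mapped fields. The helper name and checkpoint path are placeholders.
def _inspect_converted_config(checkpoint_path="sew_d_base.pt"):
    model_ensemble, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
    config = convert_config(model_ensemble[0])
    print("hidden_size:", config.hidden_size)
    print("num_hidden_layers:", config.num_hidden_layers)
    print("conv_dim:", config.conv_dim)
    return config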