def build_model(
    cfg: dict,
    sgn_dim: int,
    features_dim: int,
    gls_vocab: GlossVocabulary,
    txt_vocab: TextVocabulary,
    do_recognition: bool = True,
    do_translation: bool = True,
) -> SignModel:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param sgn_dim: feature dimension of the sign frame representation,
        i.e. 2560 for EfficientNet-7.
    :param features_dim: feature dimension of the additional feature stream
        embedded by ``features_embed``
    :param gls_vocab: sign gloss vocabulary
    :param txt_vocab: spoken language word vocabulary
    :param do_recognition: flag to build the model with recognition output.
    :param do_translation: flag to build the model with translation decoder.
    :return: built and initialized model
    """
    txt_padding_idx = txt_vocab.stoi[PAD_TOKEN]

    sgn_embed: SpatialEmbeddings = SpatialEmbeddings(
        **cfg["encoder"]["embeddings"],
        num_heads=cfg["encoder"]["num_heads"],
        input_size=sgn_dim,
    )

    features_embed: SpatialEmbeddings = SpatialEmbeddings(
        embedding_dim=512,
        scale=False,
        dropout=0.1,
        norm_type="batch",
        activation_type="softsign",
        num_heads=cfg["encoder"]["num_heads"],
        input_size=features_dim,
    )

    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.0)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert (
            cfg["encoder"]["embeddings"]["embedding_dim"]
            == cfg["encoder"]["hidden_size"]
        ), "for transformer, emb_size must be hidden_size"
        encoder = TransformerEncoder(
            **cfg["encoder"],
            emb_size=cfg["encoder"]["embeddings"]["embedding_dim"],
            emb_dropout=enc_emb_dropout,
        )
    else:
        encoder = RecurrentEncoder(
            **cfg["encoder"],
            emb_size=sgn_embed.embedding_dim,
            emb_dropout=enc_emb_dropout,
        )

    if do_recognition:
        gloss_output_layer = nn.Linear(encoder.output_size, len(gls_vocab))
        if cfg["encoder"].get("freeze", False):
            freeze_params(gloss_output_layer)
    else:
        gloss_output_layer = None

    # build decoder and word embeddings
    if do_translation:
        txt_embed: Union[Embeddings, None] = Embeddings(
            **cfg["decoder"]["embeddings"],
            num_heads=cfg["decoder"]["num_heads"],
            vocab_size=len(txt_vocab),
            padding_idx=txt_padding_idx,
        )
        dec_dropout = cfg["decoder"].get("dropout", 0.0)
        dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
        if cfg["decoder"].get("type", "recurrent") == "transformer":
            decoder = TransformerDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(txt_vocab),
                emb_size=txt_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
            )
        else:
            decoder = RecurrentDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(txt_vocab),
                emb_size=txt_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
            )
    else:
        txt_embed = None
        decoder = None

    model: SignModel = SignModel(
        encoder=encoder,
        gloss_output_layer=gloss_output_layer,
        decoder=decoder,
        sgn_embed=sgn_embed,
        features_embed=features_embed,
        txt_embed=txt_embed,
        gls_vocab=gls_vocab,
        txt_vocab=txt_vocab,
        do_recognition=do_recognition,
        do_translation=do_translation,
    )

    if do_translation:
        # tie softmax layer with txt embeddings
        if cfg.get("tied_softmax", False):
            # noinspection PyUnresolvedReferences
            if txt_embed.lut.weight.shape == model.decoder.output_layer.weight.shape:
                # (also) share txt embeddings and softmax layer:
                # noinspection PyUnresolvedReferences
                model.decoder.output_layer.weight = txt_embed.lut.weight
            else:
                raise ValueError(
                    "For tied_softmax, the decoder embedding_dim and decoder "
                    "hidden_size must be the same. "
                    "The decoder must be a Transformer."
                )

    # custom initialization of model parameters
    initialize_model(model, cfg, txt_padding_idx)

    return model
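
# Illustrative only: a minimal, hypothetical configuration sketch showing the keys
# that build_model() reads. The key names follow the code above, but the concrete
# values (layer counts, sizes, dropout rates) are assumptions for demonstration,
# not a shipped config; in practice an equivalent dict would typically be loaded
# from a YAML experiment file. Note that for a transformer encoder the embedding
# dim must equal the hidden size, as asserted above.
_EXAMPLE_MODEL_CFG = {
    "tied_softmax": False,
    "encoder": {
        "type": "transformer",
        "num_layers": 3,
        "num_heads": 8,
        "hidden_size": 512,
        "ff_size": 2048,
        "dropout": 0.1,
        "embeddings": {"embedding_dim": 512, "dropout": 0.1},
    },
    "decoder": {
        "type": "transformer",
        "num_layers": 3,
        "num_heads": 8,
        "hidden_size": 512,
        "ff_size": 2048,
        "dropout": 0.1,
        "embeddings": {"embedding_dim": 512, "dropout": 0.1},
    },
}
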
def build_pose_model(
    cfg: dict,
    sgn_dim: int,
    gls_vocab: GlossVocabulary,
    txt_vocab: TextVocabulary,
    do_recognition: bool = True,
    do_translation: bool = True,
) -> PoseModel:
    """
    Build and initialize the pose-based model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param sgn_dim: feature dimension of the sign frame representation,
        i.e. 2560 for EfficientNet-7 (not used by the pose model).
    :param gls_vocab: sign gloss vocabulary
    :param txt_vocab: spoken language word vocabulary
    :param do_recognition: flag to build the model with recognition output.
    :param do_translation: flag to build the model with translation decoder.
    :return: built and initialized model
    """
    txt_padding_idx = txt_vocab.stoi[PAD_TOKEN]

    # (field name, flattened keypoint dimension) pairs for each pose estimation stream
    pose_estimation_fields = [
        ("body_2d", 13 * 2),
        ("body_3d", 13 * 3),
        ("face_2d", 84 * 2),
        ("face_3d", 84 * 3),
        ("left_hand_2d", 21 * 2),
        ("left_hand_3d", 21 * 3),
        ("right_hand_2d", 21 * 2),
        ("right_hand_3d", 21 * 3),
    ]

    new_embedding_config = {
        "embedding_dim": 64,  # TODO: is 64 a good per-field embedding size?
        "scale": False,
        "dropout": 0.1,
        "norm_type": "batch",
        "activation_type": "softsign",
    }

    # one SpatialEmbeddings module per pose field
    embedding_list = []
    for field in pose_estimation_fields:
        new_embedding = SpatialEmbeddings(
            **new_embedding_config,
            num_heads=cfg["encoder"]["num_heads"],
            input_size=field[1],
        )
        embedding_list.append(new_embedding)

    # build encoder
    new_hidden_size = new_embedding_config["embedding_dim"] * len(pose_estimation_fields)
    new_encoder_config = {
        "type": "transformer",
        "num_layers": 3,
        "num_heads": 8,
        "hidden_size": new_hidden_size,
        "ff_size": 2048,
        "dropout": 0.1,
    }
    enc_dropout = new_encoder_config.get("dropout", 0.0)
    enc_emb_dropout = enc_dropout
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        # assert (
        #     cfg["encoder"]["embeddings"]["embedding_dim"]
        #     == cfg["encoder"]["hidden_size"]
        # ), "for transformer, emb_size must be hidden_size"
        encoder = TransformerEncoder(
            **new_encoder_config,
            # emb_size=sgn_embed.embedding_dim,
            emb_dropout=enc_emb_dropout,
        )
    else:
        # the recurrent encoder has not been adapted to the pose inputs yet
        raise NotImplementedError(
            "build_pose_model currently only supports a transformer encoder."
        )

    if do_recognition:
        gloss_output_layer = nn.Linear(encoder.output_size, len(gls_vocab))
        if cfg["encoder"].get("freeze", False):
            freeze_params(gloss_output_layer)
    else:
        gloss_output_layer = None

    # build decoder and word embeddings
    if do_translation:
        txt_embed: Union[Embeddings, None] = Embeddings(
            **cfg["decoder"]["embeddings"],
            num_heads=cfg["decoder"]["num_heads"],
            vocab_size=len(txt_vocab),
            padding_idx=txt_padding_idx,
        )
        dec_dropout = cfg["decoder"].get("dropout", 0.0)
        dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
        if cfg["decoder"].get("type", "recurrent") == "transformer":
            decoder = TransformerDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(txt_vocab),
                emb_size=txt_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
            )
        else:
            decoder = RecurrentDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(txt_vocab),
                emb_size=txt_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
            )
    else:
        txt_embed = None
        decoder = None

    model: PoseModel = PoseModel(
        encoder=encoder,
        gloss_output_layer=gloss_output_layer,
        decoder=decoder,
        pose_embed=embedding_list,
        txt_embed=txt_embed,
        gls_vocab=gls_vocab,
        txt_vocab=txt_vocab,
        do_recognition=do_recognition,
        do_translation=do_translation,
    )

    if do_translation:
        # tie softmax layer with txt embeddings
        if cfg.get("tied_softmax", False):
            # noinspection PyUnresolvedReferences
            if txt_embed.lut.weight.shape == model.decoder.output_layer.weight.shape:
                # (also) share txt embeddings and softmax layer:
                # noinspection PyUnresolvedReferences
                model.decoder.output_layer.weight = txt_embed.lut.weight
            else:
                raise ValueError(
                    "For tied_softmax, the decoder embedding_dim and decoder "
                    "hidden_size must be the same. "
                    "The decoder must be a Transformer."
                )

    # custom initialization of model parameters
    initialize_model(model, cfg, txt_padding_idx)

    return model
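
# Illustrative only: a minimal, hypothetical configuration for build_pose_model()
# (key names/values are assumptions for demonstration, not a shipped config).
# Unlike build_model(), the pose encoder and per-field embeddings are hard-coded
# inside the builder, so the "encoder" section here only needs to supply num_heads,
# type and (optionally) freeze. With the eight pose fields at 64 dims each, the
# encoder hidden size works out to 64 * 8 = 512, matching the 512-dim decoder below.
_EXAMPLE_POSE_CFG = {
    "tied_softmax": False,
    "encoder": {"type": "transformer", "num_heads": 8},
    "decoder": {
        "type": "transformer",
        "num_layers": 3,
        "num_heads": 8,
        "hidden_size": 512,
        "ff_size": 2048,
        "dropout": 0.1,
        "embeddings": {"embedding_dim": 512, "dropout": 0.1},
    },
}
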