def __init__(self,
             fusion_type: str,
             num_layers: int = 4,
             num_heads: int = 8,
             hidden_size: int = 512,
             ff_size: int = 2048,
             dropout: float = 0.1,
             emb_dropout: float = 0.1,
             vocab_size: int = 1,
             freeze: bool = False,
             **kwargs):
    """
    Set up the Transformer decoder stack and its output projection.

    :param fusion_type: fusion mode; for 'late_fusion' the output layer
        takes 2 * hidden_size input features, for any other value it
        takes hidden_size input features
    :param num_layers: number of Transformer decoder layers
    :param num_heads: number of attention heads in each layer
    :param hidden_size: hidden size of the layers
    :param ff_size: position-wise feed-forward size
    :param dropout: dropout probability inside the decoder layers
    :param emb_dropout: dropout probability for the embeddings
    :param vocab_size: size of the output vocabulary
    :param freeze: if True, `freeze_params` is applied to this module
    :param kwargs: extra keyword arguments; accepted but not used here
    """
    super(TransformerDecoder, self).__init__()

    self._hidden_size = hidden_size
    self._output_size = vocab_size

    # identical decoder layers, one per requested level
    decoder_stack = [
        TransformerDecoderLayer(
            size=hidden_size,
            ff_size=ff_size,
            num_heads=num_heads,
            dropout=dropout,
        )
        for _ in range(num_layers)
    ]
    self.layers = nn.ModuleList(decoder_stack)

    self.pe = PositionalEncoding(hidden_size)
    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
    self.emb_dropout = nn.Dropout(p=emb_dropout)

    # late fusion feeds twice as many features into the output projection
    if fusion_type == 'late_fusion':
        projection_in = 2 * hidden_size
    else:
        projection_in = hidden_size
    self.output_layer = nn.Linear(projection_in, vocab_size, bias=False)

    if freeze:
        freeze_params(self)
def __init__(self,
             hidden_size: int = 512,
             ff_size: int = 2048,
             num_layers: int = 8,
             num_heads: int = 4,
             dropout: float = 0.1,
             emb_dropout: float = 0.1,
             freeze: bool = False,
             **kwargs):
    """
    Set up the Transformer encoder stack.

    :param hidden_size: hidden size of the layers; also stored as the
        encoder's output size
    :param ff_size: position-wise feed-forward layer size
    :param num_layers: number of encoder layers
    :param num_heads: number of heads for multi-headed attention
    :param dropout: dropout probability for the Transformer layers
    :param emb_dropout: dropout probability of the `emb_dropout` module
    :param freeze: if True, `freeze_params` is applied to this module
    :param kwargs: extra keyword arguments; accepted but not used here
    """
    super(TransformerEncoder, self).__init__()

    # identical encoder layers, one per requested level
    encoder_stack = [
        TransformerEncoderLayer(
            size=hidden_size,
            ff_size=ff_size,
            num_heads=num_heads,
            dropout=dropout,
        )
        for _ in range(num_layers)
    ]
    self.layers = nn.ModuleList(encoder_stack)

    # projects a 2 * hidden_size input back to hidden_size, then
    # applies ReLU and layer normalisation
    self.layer_norm2 = nn.Sequential(
        nn.Linear(hidden_size * 2, hidden_size),
        nn.ReLU(),
        nn.LayerNorm(hidden_size, eps=1e-6),
    )

    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
    self.pe = PositionalEncoding(hidden_size)
    self.emb_dropout = nn.Dropout(p=emb_dropout)

    self._output_size = hidden_size

    if freeze:
        freeze_params(self)
def __init__(self,
             embedding_dim: int = 64,
             num_heads: int = 8,
             scale: bool = False,
             scale_factor: float = None,
             norm_type: str = None,
             activation_type: str = None,
             vocab_size: int = 0,
             padding_idx: int = 1,
             freeze: bool = False,
             **kwargs):
    """
    Create a lookup-table embedding for a vocabulary.

    :param embedding_dim: size of each embedding vector
    :param num_heads: passed to `MaskedNorm` as `num_groups`
    :param scale: if True, a `scale_factor` attribute is stored
    :param scale_factor: explicit scale factor; when falsy (None or 0),
        sqrt(embedding_dim) is used instead
    :param norm_type: if truthy, a `MaskedNorm` of this type is created
    :param activation_type: if truthy, an activation is obtained from
        `get_activation`
    :param vocab_size: number of rows of the embedding table
    :param padding_idx: index handed to `nn.Embedding` as `padding_idx`
    :param freeze: if True, `freeze_params` is applied to this module
    :param kwargs: extra keyword arguments; accepted but not used here
    """
    super().__init__()

    self.embedding_dim = embedding_dim
    self.vocab_size = vocab_size
    self.lut = nn.Embedding(
        vocab_size, embedding_dim, padding_idx=padding_idx
    )

    # optional normalisation
    self.norm_type = norm_type
    if norm_type:
        self.norm = MaskedNorm(
            norm_type=norm_type,
            num_groups=num_heads,
            num_features=embedding_dim,
        )

    # optional activation
    self.activation_type = activation_type
    if activation_type:
        self.activation = get_activation(activation_type)

    # optional scaling; `scale_factor` is only set when scaling is on
    self.scale = scale
    if scale:
        self.scale_factor = scale_factor or math.sqrt(embedding_dim)

    if freeze:
        freeze_params(self)
def __init__(self,
             rnn_type: str = "gru",
             hidden_size: int = 1,
             emb_size: int = 1,
             num_layers: int = 1,
             dropout: float = 0.0,
             emb_dropout: float = 0.0,
             bidirectional: bool = True,
             freeze: bool = False,
             **kwargs) -> None:
    """
    Set up a recurrent (GRU or LSTM) encoder.

    :param rnn_type: "gru" selects `nn.GRU`; any other value selects
        `nn.LSTM`
    :param hidden_size: hidden size of the RNN
    :param emb_size: input feature size of the RNN
    :param num_layers: number of stacked RNN layers
    :param dropout: dropout between RNN layers; only used when
        num_layers > 1
    :param emb_dropout: dropout probability of the `emb_dropout` module
    :param bidirectional: build a bi-directional RNN
    :param freeze: if True, `freeze_params` is applied to this module
    :param kwargs: extra keyword arguments; accepted but not used here
    """
    super(RecurrentEncoder, self).__init__()

    self.emb_dropout = torch.nn.Dropout(p=emb_dropout, inplace=False)
    self.type = rnn_type
    self.emb_size = emb_size

    if rnn_type == "gru":
        rnn_cls = nn.GRU
    else:
        rnn_cls = nn.LSTM

    # inter-layer dropout is switched off for a single-layer RNN
    inter_layer_dropout = dropout if num_layers > 1 else 0.0
    self.rnn = rnn_cls(
        emb_size,
        hidden_size,
        num_layers,
        batch_first=True,
        bidirectional=bidirectional,
        dropout=inter_layer_dropout,
    )

    # a bi-directional RNN yields both directions' states side by side
    if bidirectional:
        self._output_size = 2 * hidden_size
    else:
        self._output_size = hidden_size

    if freeze:
        freeze_params(self)
def __init__(self,
             embedding_dim: int,
             input_size: int,
             num_heads: int,
             freeze: bool = False,
             norm_type: str = None,
             activation_type: str = None,
             scale: bool = False,
             scale_factor: float = None,
             **kwargs):
    """
    Create linear embeddings for continuous input features.

    :param embedding_dim: output size of the linear projections
    :param input_size: input feature size of the main projection `ln`
    :param num_heads: passed to `MaskedNorm` as `num_groups`
    :param freeze: if True, `freeze_params` is applied to this module
    :param norm_type: if truthy, a `MaskedNorm` of this type is created
    :param activation_type: if truthy, an activation is obtained from
        `get_activation`
    :param scale: if True, a `scale_factor` attribute is stored
    :param scale_factor: explicit scale factor; when falsy (None or 0),
        sqrt(embedding_dim) is used instead
    :param kwargs: extra keyword arguments; accepted but not used here
    """
    super().__init__()

    self.input_size = input_size
    self.embedding_dim = embedding_dim

    self.ln = nn.Linear(input_size, embedding_dim)
    # fixed 236-wide input; presumably 2-D coordinates of 84 + 21 + 13
    # pose keypoints -- TODO confirm against the caller
    self.ln_pose = nn.Linear(2 * 84 + 2 * 21 + 2 * 13, embedding_dim)

    # optional normalisation
    self.norm_type = norm_type
    if norm_type:
        self.norm = MaskedNorm(
            norm_type=norm_type,
            num_groups=num_heads,
            num_features=embedding_dim,
        )

    # optional activation
    self.activation_type = activation_type
    if activation_type:
        self.activation = get_activation(activation_type)

    # optional scaling; `scale_factor` is only set when scaling is on
    self.scale = scale
    if scale:
        self.scale_factor = scale_factor or math.sqrt(embedding_dim)

    if freeze:
        freeze_params(self)
def __init__(self,
             rnn_type: str = "gru",
             emb_size: int = 0,
             hidden_size: int = 0,
             encoder: Encoder = None,
             attention: str = "bahdanau",
             num_layers: int = 1,
             vocab_size: int = 0,
             dropout: float = 0.0,
             emb_dropout: float = 0.0,
             hidden_dropout: float = 0.0,
             init_hidden: str = "bridge",
             input_feeding: bool = True,
             freeze: bool = False,
             **kwargs) -> None:
    """
    Set up a recurrent decoder with attention.

    :param rnn_type: "gru" selects `nn.GRU`; any other value selects
        `nn.LSTM`
    :param emb_size: target embedding size
    :param hidden_size: hidden size of the RNN
    :param encoder: encoder connected to this decoder; its
        `output_size` is read here, so it must not be None
    :param attention: type of attention, valid options: "bahdanau",
        "luong"
    :param num_layers: number of recurrent layers
    :param vocab_size: target vocabulary size (output layer width)
    :param dropout: dropout between RNN layers; only used when
        num_layers > 1
    :param emb_dropout: dropout probability of the `emb_dropout` module
    :param hidden_dropout: dropout probability of the `hidden_dropout`
        module
    :param init_hidden: stored as `init_hidden_option`. For "bridge" a
        linear `bridge_layer` from encoder.output_size to hidden_size
        is created; for "last" the encoder output size must equal
        hidden_size or 2 * hidden_size; any other value (e.g. "zeros")
        creates nothing extra here
    :param input_feeding: if True, the RNN input size is
        emb_size + hidden_size (Luong-style input feeding), otherwise
        emb_size
    :param freeze: if True, `freeze_params` is applied to this module
    :param kwargs: extra keyword arguments; accepted but not used here
    :raises ValueError: for an unknown attention type, or when
        init_hidden is "last" and the sizes are incompatible
    """
    super(RecurrentDecoder, self).__init__()

    self.emb_dropout = torch.nn.Dropout(p=emb_dropout, inplace=False)
    self.type = rnn_type
    self.hidden_dropout = torch.nn.Dropout(p=hidden_dropout,
                                           inplace=False)
    self.hidden_size = hidden_size
    self.emb_size = emb_size

    if rnn_type == "gru":
        rnn_cls = nn.GRU
    else:
        rnn_cls = nn.LSTM

    self.input_feeding = input_feeding
    # with input feeding the previous attention vector is appended to
    # the previous word embedding; without it only the embedding is fed
    self.rnn_input_size = (
        emb_size + hidden_size if input_feeding else emb_size
    )

    # the decoder RNN; inter-layer dropout is off for a single layer
    inter_layer_dropout = dropout if num_layers > 1 else 0.0
    self.rnn = rnn_cls(
        self.rnn_input_size,
        hidden_size,
        num_layers,
        batch_first=True,
        dropout=inter_layer_dropout,
    )

    # combine output with context vector before output layer
    # (Luong-style)
    self.att_vector_layer = nn.Linear(
        hidden_size + encoder.output_size, hidden_size, bias=True
    )

    self.output_layer = nn.Linear(hidden_size, vocab_size, bias=False)
    self._output_size = vocab_size

    if attention == "bahdanau":
        self.attention = BahdanauAttention(
            hidden_size=hidden_size,
            key_size=encoder.output_size,
            query_size=hidden_size,
        )
    elif attention == "luong":
        self.attention = LuongAttention(
            hidden_size=hidden_size, key_size=encoder.output_size
        )
    else:
        raise ValueError("Unknown attention mechanism: %s. "
                         "Valid options: 'bahdanau', 'luong'." % attention)

    self.num_layers = num_layers

    # to initialize from the final encoder state of last layer
    self.init_hidden_option = init_hidden
    if init_hidden == "bridge":
        self.bridge_layer = nn.Linear(
            encoder.output_size, hidden_size, bias=True
        )
    elif init_hidden == "last":
        # same size, or twice the size for a bidirectional encoder
        compatible_sizes = (hidden_size, 2 * hidden_size)
        if encoder.output_size not in compatible_sizes:
            raise ValueError(
                "For initializing the decoder state with the "
                "last encoder state, their sizes have to match "
                "(encoder: {} vs. decoder: {})".format(
                    encoder.output_size, self.hidden_size))

    if freeze:
        freeze_params(self)
def build_model(
    cfg: dict,
    sgn_dim: int,
    features_dim: int,
    gls_vocab: GlossVocabulary,
    txt_vocab: TextVocabulary,
    do_recognition: bool = True,
    do_translation: bool = True,
) -> SignModel:
    """
    Build and initialize a SignModel according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param sgn_dim: input size of the sign-frame embedding (`sgn_embed`)
    :param features_dim: input size of the additional feature embedding
        (`features_embed`)
    :param gls_vocab: sign gloss vocabulary
    :param txt_vocab: spoken language word vocabulary
    :param do_recognition: flag to build the model with recognition output.
    :param do_translation: flag to build the model with translation decoder.
    :return: built and initialized model
    :raises ValueError: if `tied_softmax` is requested but the text
        embedding and decoder output weights differ in shape
    """
    enc_cfg = cfg["encoder"]
    txt_padding_idx = txt_vocab.stoi[PAD_TOKEN]

    # input embeddings: sign frames (configured) and extra features (fixed)
    sgn_embed = SpatialEmbeddings(
        **enc_cfg["embeddings"],
        num_heads=enc_cfg["num_heads"],
        input_size=sgn_dim,
    )
    features_embed = SpatialEmbeddings(
        embedding_dim=512,
        scale=False,
        dropout=0.1,
        norm_type='batch',
        activation_type='softsign',
        num_heads=enc_cfg["num_heads"],
        input_size=features_dim,
    )

    # build encoder; embedding dropout falls back to the encoder dropout
    enc_dropout = enc_cfg.get("dropout", 0.0)
    enc_emb_dropout = enc_cfg["embeddings"].get("dropout", enc_dropout)
    if enc_cfg.get("type", "recurrent") == "transformer":
        enc_emb_dim = enc_cfg["embeddings"]["embedding_dim"]
        assert (
            enc_emb_dim == enc_cfg["hidden_size"]
        ), "for transformer, emb_size must be hidden_size"
        encoder = TransformerEncoder(
            **enc_cfg,
            emb_size=enc_cfg["embeddings"]["embedding_dim"],
            emb_dropout=enc_emb_dropout,
        )
    else:
        encoder = RecurrentEncoder(
            **enc_cfg,
            emb_size=sgn_embed.embedding_dim,
            emb_dropout=enc_emb_dropout,
        )

    # recognition head on top of the encoder output
    gloss_output_layer = None
    if do_recognition:
        gloss_output_layer = nn.Linear(encoder.output_size, len(gls_vocab))
        if enc_cfg.get("freeze", False):
            freeze_params(gloss_output_layer)

    # build decoder and word embeddings
    txt_embed = None
    decoder = None
    if do_translation:
        dec_cfg = cfg["decoder"]
        txt_embed = Embeddings(
            **dec_cfg["embeddings"],
            num_heads=dec_cfg["num_heads"],
            vocab_size=len(txt_vocab),
            padding_idx=txt_padding_idx,
        )
        dec_dropout = dec_cfg.get("dropout", 0.0)
        dec_emb_dropout = dec_cfg["embeddings"].get("dropout", dec_dropout)

        # both decoder classes are constructed with the same arguments
        if dec_cfg.get("type", "recurrent") == "transformer":
            decoder_cls = TransformerDecoder
        else:
            decoder_cls = RecurrentDecoder
        decoder = decoder_cls(
            **dec_cfg,
            encoder=encoder,
            vocab_size=len(txt_vocab),
            emb_size=txt_embed.embedding_dim,
            emb_dropout=dec_emb_dropout,
        )

    model = SignModel(
        encoder=encoder,
        gloss_output_layer=gloss_output_layer,
        decoder=decoder,
        sgn_embed=sgn_embed,
        features_embed=features_embed,
        txt_embed=txt_embed,
        gls_vocab=gls_vocab,
        txt_vocab=txt_vocab,
        do_recognition=do_recognition,
        do_translation=do_translation,
    )

    # tie softmax layer with txt embeddings
    if do_translation and cfg.get("tied_softmax", False):
        # noinspection PyUnresolvedReferences
        embed_weight = txt_embed.lut.weight
        # noinspection PyUnresolvedReferences
        if embed_weight.shape != model.decoder.output_layer.weight.shape:
            raise ValueError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")
        # (also) share txt embeddings and softmax layer:
        model.decoder.output_layer.weight = embed_weight

    # custom initialization of model parameters
    initialize_model(model, cfg, txt_padding_idx)

    return model
def build_pose_model(
    cfg: dict,
    sgn_dim: int,
    gls_vocab: GlossVocabulary,
    txt_vocab: TextVocabulary,
    do_recognition: bool = True,
    do_translation: bool = True,
) -> PoseModel:
    """
    Build and initialize a PoseModel according to the configuration.

    One SpatialEmbeddings module is created per pose field; the encoder
    width is the sum of all per-field embedding widths.

    :param cfg: dictionary configuration containing model specifications
    :param sgn_dim: not used by this function; kept so the signature
        stays parallel to `build_model`
    :param gls_vocab: sign gloss vocabulary
    :param txt_vocab: spoken language word vocabulary
    :param do_recognition: flag to build the model with recognition output.
    :param do_translation: flag to build the model with translation decoder.
    :return: built and initialized model
    :raises ValueError: if `tied_softmax` is requested but the text
        embedding and decoder output weights differ in shape
    """
    txt_padding_idx = txt_vocab.stoi[PAD_TOKEN]

    # (field name, input width) of every pose stream that gets its own
    # embedding module
    pose_estimation_fields = [
        ('body_2d', 13 * 2),
        ('body_3d', 13 * 3),
        ('face_2d', 84 * 2),
        ('face_3d', 84 * 3),
        ('left_hand_2d', 21 * 2),
        ('left_hand_3d', 21 * 3),
        ('right_hand_2d', 21 * 2),
        ('right_hand_3d', 21 * 3),
    ]

    new_embedding_config = {
        'embedding_dim': 64,  # NOTE(review): per-field width not tuned yet
        'scale': False,
        'dropout': 0.1,
        'norm_type': 'batch',
        'activation_type': 'softsign',
    }

    # NOTE(review): this is a plain Python list; its parameters are only
    # registered if PoseModel wraps it (e.g. in nn.ModuleList) -- confirm.
    embedding_list = [
        SpatialEmbeddings(
            **new_embedding_config,
            num_heads=cfg["encoder"]["num_heads"],
            input_size=field_dim,
        )
        for _, field_dim in pose_estimation_fields
    ]

    # build encoder: its width is the total width of all field embeddings
    new_hidden_size = new_embedding_config['embedding_dim'] * len(
        pose_estimation_fields)
    new_encoder_config = {
        'type': 'transformer',
        'num_layers': 3,
        'num_heads': 8,
        'hidden_size': new_hidden_size,
        'ff_size': 2048,
        'dropout': 0.1,
    }

    enc_dropout = new_encoder_config.get("dropout", 0.0)
    enc_emb_dropout = enc_dropout
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        encoder = TransformerEncoder(
            **new_encoder_config,
            emb_dropout=enc_emb_dropout,
        )
    else:
        # The original code read `sgn_embed.embedding_dim` here, but no
        # `sgn_embed` exists in this function, so this branch always
        # failed with a NameError. The encoder input width is the total
        # pose-embedding width, the same value the transformer branch uses.
        # NOTE(review): the recurrent path is otherwise still driven by
        # cfg["encoder"] and has not been adapted to the pose setup.
        encoder = RecurrentEncoder(
            **cfg["encoder"],
            emb_size=new_hidden_size,
            emb_dropout=enc_emb_dropout,
        )

    # recognition head on top of the encoder output
    if do_recognition:
        gloss_output_layer = nn.Linear(encoder.output_size, len(gls_vocab))
        if cfg["encoder"].get("freeze", False):
            freeze_params(gloss_output_layer)
    else:
        gloss_output_layer = None

    # build decoder and word embeddings
    if do_translation:
        dec_cfg = cfg["decoder"]
        txt_embed = Embeddings(
            **dec_cfg["embeddings"],
            num_heads=dec_cfg["num_heads"],
            vocab_size=len(txt_vocab),
            padding_idx=txt_padding_idx,
        )
        dec_dropout = dec_cfg.get("dropout", 0.0)
        dec_emb_dropout = dec_cfg["embeddings"].get("dropout", dec_dropout)

        # both decoder classes are constructed with the same arguments
        if dec_cfg.get("type", "recurrent") == "transformer":
            decoder_cls = TransformerDecoder
        else:
            decoder_cls = RecurrentDecoder
        decoder = decoder_cls(
            **dec_cfg,
            encoder=encoder,
            vocab_size=len(txt_vocab),
            emb_size=txt_embed.embedding_dim,
            emb_dropout=dec_emb_dropout,
        )
    else:
        txt_embed = None
        decoder = None

    model = PoseModel(
        encoder=encoder,
        gloss_output_layer=gloss_output_layer,
        decoder=decoder,
        pose_embed=embedding_list,
        txt_embed=txt_embed,
        gls_vocab=gls_vocab,
        txt_vocab=txt_vocab,
        do_recognition=do_recognition,
        do_translation=do_translation,
    )

    # tie softmax layer with txt embeddings
    if do_translation and cfg.get("tied_softmax", False):
        # noinspection PyUnresolvedReferences
        embed_weight = txt_embed.lut.weight
        # noinspection PyUnresolvedReferences
        if embed_weight.shape != model.decoder.output_layer.weight.shape:
            raise ValueError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")
        # (also) share txt embeddings and softmax layer:
        model.decoder.output_layer.weight = embed_weight

    # custom initialization of model parameters
    initialize_model(model, cfg, txt_padding_idx)

    return model