def __init__(
    self,
    vocab_size: int,
    hidden_size: int,
    num_layers: int,
    inner_size: int,
    num_attention_heads: int,
    max_sequence_length: int = 512,
    num_token_types: int = 2,
    embedding_dropout: float = 0.0,
    learn_positional_encodings: bool = False,
    ffn_dropout: float = 0.0,
    attn_score_dropout: float = 0.0,
    attn_layer_dropout: float = 0.0,
    hidden_act: str = 'relu',
    mask_future: bool = False,
    pre_ln: bool = False,
    pre_ln_final_layer_norm: bool = True,
):
    super().__init__()

    self._vocab_size = vocab_size
    self._hidden_size = hidden_size
    self._max_sequence_length = max_sequence_length

    # token / positional / token-type embedding lookup
    self._embedding = TransformerEmbedding(
        vocab_size=self._vocab_size,
        hidden_size=self._hidden_size,
        max_sequence_length=max_sequence_length,
        num_token_types=num_token_types,
        embedding_dropout=embedding_dropout,
        learn_positional_encodings=learn_positional_encodings,
    )

    # stack of self-attention encoder layers
    self._encoder = TransformerEncoder(
        hidden_size=self._hidden_size,
        num_layers=num_layers,
        inner_size=inner_size,
        num_attention_heads=num_attention_heads,
        ffn_dropout=ffn_dropout,
        attn_score_dropout=attn_score_dropout,
        attn_layer_dropout=attn_layer_dropout,
        hidden_act=hidden_act,
        mask_future=mask_future,
        pre_ln=pre_ln,
        pre_ln_final_layer_norm=pre_ln_final_layer_norm,
    )
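# The constructor above composes two submodules: an embedding layer mapping token ids to
# vectors (token plus positional embeddings) and a stack of self-attention layers.
# Below is a minimal, hypothetical sketch of that composition in plain PyTorch. The class
# name TinyEncoderSketch and all hyperparameters are illustrative assumptions; it uses
# torch.nn.TransformerEncoder, not the TransformerEmbedding/TransformerEncoder classes
# referenced above.

import torch


class TinyEncoderSketch(torch.nn.Module):
    """Illustrative only: token + positional embedding followed by self-attention layers."""

    def __init__(self, vocab_size=1000, hidden_size=64, num_layers=2, num_heads=4, max_len=512):
        super().__init__()
        self.tok_emb = torch.nn.Embedding(vocab_size, hidden_size)
        self.pos_emb = torch.nn.Embedding(max_len, hidden_size)  # learned positions, for simplicity
        layer = torch.nn.TransformerEncoderLayer(
            d_model=hidden_size, nhead=num_heads, dim_feedforward=4 * hidden_size, batch_first=True
        )
        self.encoder = torch.nn.TransformerEncoder(layer, num_layers=num_layers)

    def forward(self, input_ids, pad_mask=None):
        positions = torch.arange(input_ids.size(1), device=input_ids.device)
        hidden = self.tok_emb(input_ids) + self.pos_emb(positions)
        return self.encoder(hidden, src_key_padding_mask=pad_mask)


ids = torch.randint(0, 1000, (2, 16))    # (batch, seq_len)
print(TinyEncoderSketch()(ids).shape)    # torch.Size([2, 16, 64])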
def __init__(
    self,
    num_layers: int,
    hidden_size: int,
    inner_size: int,
    mask_future: bool = False,
    num_attention_heads: int = 1,
    attn_score_dropout: float = 0.0,
    attn_layer_dropout: float = 0.0,
    ffn_dropout: float = 0.0,
    hidden_act: str = "relu",
    pre_ln: bool = False,
    pre_ln_final_layer_norm: bool = True,
    hidden_steps: int = 32,
    hidden_init_method: str = "default",
    hidden_blocks: int = 0,
):
    super().__init__()

    self._hidden_steps = hidden_steps
    self._hidden_init_method = hidden_init_method
    self._hidden_blocks = hidden_blocks

    if self._hidden_init_method == "default":
        self._hidden_init_method = "enc_shared"

    if hidden_blocks < 1:
        raise ValueError(f"hidden_blocks = {hidden_blocks} but is expected to be >= 1")

    if self.hidden_init_method not in self.supported_init_methods:
        raise ValueError(
            "Unknown hidden_init_method = {hidden_init_method}, supported methods are {supported_init_methods}".format(
                hidden_init_method=self.hidden_init_method,
                supported_init_methods=self.supported_init_methods,
            )
        )

    # attention bridge: compresses the input sequence into k = hidden_steps states
    self.att_bridge = AttentionBridge(
        hidden_size=hidden_size,
        k=hidden_steps,
        bridge_size=inner_size,
    )

    # separate encoder used only to initialize the hidden states when hidden_init_method == "enc"
    if self.hidden_init_method == "enc":
        self.init_hidden_enc = TransformerEncoder(
            num_layers=num_layers,
            hidden_size=hidden_size,
            inner_size=inner_size,
            mask_future=mask_future,
            num_attention_heads=num_attention_heads,
            attn_score_dropout=attn_score_dropout,
            attn_layer_dropout=attn_layer_dropout,
            ffn_dropout=ffn_dropout,
            hidden_act=hidden_act,
            pre_ln=pre_ln,
            pre_ln_final_layer_norm=pre_ln_final_layer_norm,
        )

    # self-attention encoder
    self.hidden_enc = TransformerEncoder(
        num_layers=num_layers,
        hidden_size=hidden_size,
        inner_size=inner_size,
        mask_future=mask_future,
        num_attention_heads=num_attention_heads,
        attn_score_dropout=attn_score_dropout,
        attn_layer_dropout=attn_layer_dropout,
        ffn_dropout=ffn_dropout,
        hidden_act=hidden_act,
        pre_ln=pre_ln,
        pre_ln_final_layer_norm=pre_ln_final_layer_norm,
    )
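# The key component above is AttentionBridge(hidden_size, k=hidden_steps, bridge_size),
# which turns a variable-length sequence of hidden states into a fixed number k of
# "bridge" states. Its implementation is not shown here; the sketch below is a plausible
# stand-in for that fixed-size bottleneck idea, using k learned queries that cross-attend
# to the sequence. The class name FixedSizeBridgeSketch and all sizes are assumptions.

import torch


class FixedSizeBridgeSketch(torch.nn.Module):
    """Illustrative only: compress a (batch, seq, hidden) sequence into k states."""

    def __init__(self, hidden_size=64, k=32, num_heads=4):
        super().__init__()
        # k learned query vectors, one per output "bridge" state
        self.queries = torch.nn.Parameter(torch.randn(k, hidden_size) * 0.02)
        self.attn = torch.nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)

    def forward(self, hidden_states, pad_mask=None):
        batch = hidden_states.size(0)
        q = self.queries.unsqueeze(0).expand(batch, -1, -1)   # (batch, k, hidden)
        bridged, _ = self.attn(q, hidden_states, hidden_states, key_padding_mask=pad_mask)
        return bridged                                         # (batch, k, hidden)


x = torch.randn(2, 50, 64)                   # variable-length input sequence
print(FixedSizeBridgeSketch()(x).shape)      # torch.Size([2, 32, 64]): always k states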
def __init__(
    self,
    num_layers: int,
    hidden_size: int,
    inner_size: int,
    mask_future: bool = False,
    num_attention_heads: int = 1,
    attn_score_dropout: float = 0.0,
    attn_layer_dropout: float = 0.0,
    ffn_dropout: float = 0.0,
    hidden_act: str = "relu",
    pre_ln: bool = False,
    pre_ln_final_layer_norm: bool = True,
    hidden_steps: int = 32,
    hidden_init_method: str = "default",
    hidden_blocks: int = 2,
):
    super().__init__()

    self._hidden_steps = hidden_steps
    self._hidden_init_method = hidden_init_method
    self._hidden_blocks = hidden_blocks

    if self._hidden_init_method == "default":
        self._hidden_init_method = "params"

    if self.hidden_init_method not in self.supported_init_methods:
        raise ValueError(
            "Unknown hidden_init_method = {hidden_init_method}, supported methods are {supported_init_methods}".format(
                hidden_init_method=self.hidden_init_method,
                supported_init_methods=self.supported_init_methods,
            )
        )

    if self.hidden_init_method == "params":
        # learnable initial hidden values
        self.init_hidden = torch.nn.Parameter(
            torch.nn.init.xavier_normal_(torch.empty(hidden_steps, hidden_size))
        )
        self.init_cross_att = TransformerDecoder(
            num_layers=1,
            hidden_size=hidden_size,
            inner_size=inner_size,
            num_attention_heads=num_attention_heads,
            attn_score_dropout=attn_score_dropout,
            attn_layer_dropout=attn_layer_dropout,
            ffn_dropout=ffn_dropout,
            hidden_act=hidden_act,
            pre_ln=pre_ln,
            pre_ln_final_layer_norm=pre_ln_final_layer_norm,
        )
    elif self.hidden_init_method == "bridge":
        # initialize latent with attention bridge
        self.att_bridge = AttentionBridge(
            hidden_size=hidden_size,
            k=hidden_steps,
            bridge_size=inner_size,
        )

    # cross-attention encoder
    layer = TransformerDecoder(
        num_layers=1,
        hidden_size=hidden_size,
        inner_size=inner_size,
        num_attention_heads=num_attention_heads,
        attn_score_dropout=attn_score_dropout,
        attn_layer_dropout=attn_layer_dropout,
        ffn_dropout=ffn_dropout,
        hidden_act=hidden_act,
        pre_ln=pre_ln,
        pre_ln_final_layer_norm=pre_ln_final_layer_norm,
    )
    self.cross_att_layers = torch.nn.ModuleList([copy.deepcopy(layer) for _ in range(hidden_blocks)])

    # self-attention encoder
    layer = TransformerEncoder(
        num_layers=num_layers,
        hidden_size=hidden_size,
        inner_size=inner_size,
        mask_future=mask_future,
        num_attention_heads=num_attention_heads,
        attn_score_dropout=attn_score_dropout,
        attn_layer_dropout=attn_layer_dropout,
        ffn_dropout=ffn_dropout,
        hidden_act=hidden_act,
        pre_ln=pre_ln,
        pre_ln_final_layer_norm=pre_ln_final_layer_norm,
    )
    self.self_att_layers = torch.nn.ModuleList([copy.deepcopy(layer) for _ in range(hidden_blocks)])
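# In the "params" branch above, a Xavier-initialized (hidden_steps, hidden_size) parameter
# serves as the initial latent, and hidden_blocks pairs of cross-attention and self-attention
# modules then let those latents read from the encoder states. The forward pass is not shown;
# the sketch below illustrates the assumed pattern with plain torch.nn.MultiheadAttention
# (not the TransformerDecoder/TransformerEncoder modules used above). The class name
# LatentCrossAttnSketch and all sizes are assumptions.

import torch


class LatentCrossAttnSketch(torch.nn.Module):
    """Illustrative only: learnable latents repeatedly cross- and self-attend over the input."""

    def __init__(self, hidden_size=64, hidden_steps=32, num_heads=4, hidden_blocks=2):
        super().__init__()
        self.init_hidden = torch.nn.Parameter(
            torch.nn.init.xavier_normal_(torch.empty(hidden_steps, hidden_size))
        )
        self.cross_att_layers = torch.nn.ModuleList(
            [torch.nn.MultiheadAttention(hidden_size, num_heads, batch_first=True) for _ in range(hidden_blocks)]
        )
        self.self_att_layers = torch.nn.ModuleList(
            [torch.nn.MultiheadAttention(hidden_size, num_heads, batch_first=True) for _ in range(hidden_blocks)]
        )

    def forward(self, encoder_states):
        batch = encoder_states.size(0)
        hidden = self.init_hidden.unsqueeze(0).expand(batch, -1, -1)       # tile latents per batch
        for cross_att, self_att in zip(self.cross_att_layers, self.self_att_layers):
            hidden, _ = cross_att(hidden, encoder_states, encoder_states)  # read from the input
            hidden, _ = self_att(hidden, hidden, hidden)                   # mix the latents
        return hidden                                                      # (batch, hidden_steps, hidden)


print(LatentCrossAttnSketch()(torch.randn(2, 50, 64)).shape)  # torch.Size([2, 32, 64])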
def __init__(
    self,
    num_layers: int,
    hidden_size: int,
    inner_size: int,
    mask_future: bool = False,
    num_attention_heads: int = 1,
    attn_score_dropout: float = 0.0,
    attn_layer_dropout: float = 0.0,
    ffn_dropout: float = 0.0,
    hidden_act: str = "relu",
    pre_ln: bool = False,
    pre_ln_final_layer_norm: bool = True,
    hidden_steps: int = 4,
    hidden_init_method: str = "default",
    hidden_blocks: int = 2,
    pooling_type: str = "max",
):
    super().__init__()

    # minimal steps to allow reduction
    self._hidden_steps = hidden_steps
    self._hidden_init_method = hidden_init_method
    self._hidden_blocks = hidden_blocks
    self._pooling_type = pooling_type

    if self._hidden_steps < 2:
        raise ValueError(f"Expected hidden_steps >= 2 but received hidden_steps = {self._hidden_steps}")

    if self.hidden_init_method not in self.supported_init_methods:
        raise ValueError(
            "Unknown hidden_init_method = {hidden_init_method}, supported methods are {supported_init_methods}".format(
                hidden_init_method=self.hidden_init_method,
                supported_init_methods=self.supported_init_methods,
            )
        )

    if self._pooling_type not in self.supported_arch:
        raise ValueError(f"Unknown pooling_type = {pooling_type}. Available values = {self.supported_arch}")

    # self-attention encoder
    layer = TransformerEncoder(
        num_layers=num_layers,
        hidden_size=hidden_size,
        inner_size=inner_size,
        mask_future=mask_future,
        num_attention_heads=num_attention_heads,
        attn_score_dropout=attn_score_dropout,
        attn_layer_dropout=attn_layer_dropout,
        ffn_dropout=ffn_dropout,
        hidden_act=hidden_act,
        pre_ln=pre_ln,
        pre_ln_final_layer_norm=pre_ln_final_layer_norm,
    )
    self.self_att_layers = torch.nn.ModuleList([copy.deepcopy(layer) for _ in range(hidden_blocks)])

    self.pooling = self._build_pooling_module()
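# The constructor above delegates to self._build_pooling_module(), which is not shown.
# Given pooling_type "max" and the hidden_steps >= 2 requirement, a reasonable assumption
# is that each block applies 1D pooling over the time dimension, shrinking the sequence
# toward hidden_steps. The snippet below only illustrates that shape effect with a
# hypothetical MaxPool1d stand-in; it is not the actual pooling module.

import torch

# Hypothetical stand-in for one pooling stage (kernel and stride are assumptions)
pool = torch.nn.MaxPool1d(kernel_size=2, stride=2)

hidden_states = torch.randn(2, 16, 64)                        # (batch, seq_len, hidden)
reduced = pool(hidden_states.transpose(1, 2)).transpose(1, 2)  # pool over the time axis
print(reduced.shape)                                          # torch.Size([2, 8, 64]): sequence length halved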