def __init__(
        self,
        input_dim: int,  # input embedding dimension
        num_layers: int = 6,
        num_heads: int = 8,
        feedforward_hidden_dim: Optional[int] = None,
        feedforward_dropout: float = 0.1,
        attention_dim: Optional[int] = None,
        value_dim: Optional[int] = None,
        residual_dropout: float = 0.1,
        attention_dropout: float = 0.1,
        use_positional_embedding: bool = True,
):
    super(TransformerEncoder, self).__init__()

    self._attention_layers: List[MultiHeadSelfAttention] = []
    self._attention_norm_layers: List[LayerNorm] = []
    self._feedforward_layers: List[FeedForward] = []
    self._feedforward_norm_layers: List[LayerNorm] = []

    hidden_dim = input_dim
    attention_dim = attention_dim or (hidden_dim // num_heads)
    value_dim = value_dim or (hidden_dim // num_heads)
    feedforward_hidden_dim = feedforward_hidden_dim or hidden_dim

    for i in range(num_layers):
        attention = MultiHeadSelfAttention(
            num_heads, hidden_dim, attention_dim * num_heads,
            value_dim * num_heads, attention_dropout=attention_dropout)
        self.add_module(f'attention_{i}', attention)
        self._attention_layers.append(attention)

        attention_norm = LayerNorm(hidden_dim)
        self.add_module(f'attention_norm_{i}', attention_norm)
        self._attention_norm_layers.append(attention_norm)

        feedforward = FeedForward(
            hidden_dim,
            num_layers=2,
            hidden_dims=[feedforward_hidden_dim, hidden_dim],
            activations=[Activation.by_name('relu')(),
                         Activation.by_name('linear')()],
            dropout=feedforward_dropout)
        self.add_module(f"feedforward_{i}", feedforward)
        self._feedforward_layers.append(feedforward)

        feedforward_norm = LayerNorm(hidden_dim)
        self.add_module(f"feedforward_norm_{i}", feedforward_norm)
        self._feedforward_norm_layers.append(feedforward_norm)

    self._dropout = torch.nn.Dropout(residual_dropout)

    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self._use_positional_embedding = use_positional_embedding
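# A minimal sketch of the sinusoidal positional embedding that `use_positional_embedding`
# presumably toggles in the encoder above; the function name and the exact wiring into
# the encoder's forward pass are assumptions, not taken from that class.
import math
import torch


def add_sinusoidal_positional_embedding(inputs: torch.Tensor) -> torch.Tensor:
    # inputs: (batch_size, seq_len, hidden_dim), hidden_dim assumed even
    _, seq_len, hidden_dim = inputs.size()
    position = torch.arange(seq_len, dtype=torch.float, device=inputs.device).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, hidden_dim, 2, dtype=torch.float, device=inputs.device)
        * (-math.log(10000.0) / hidden_dim))
    embedding = torch.zeros(seq_len, hidden_dim, device=inputs.device)
    embedding[:, 0::2] = torch.sin(position * div_term)
    embedding[:, 1::2] = torch.cos(position * div_term)
    return inputs + embedding.unsqueeze(0)


# usage sketch: add_sinusoidal_positional_embedding(torch.randn(2, 7, 16))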
def __init__(
        self,
        input_dim: int,
        num_heads: int = 8,
        attention_dim: Optional[int] = None,
        value_dim: Optional[int] = None,
        feedforward_hidden_dim: Optional[int] = None,
        residual_dropout: float = 0.1,
        attention_dropout: float = 0.1,
        feedforward_dropout: float = 0.1,
        use_vanilla_wiring: bool = False,
):
    super(UTDecBlock, self).__init__()

    hidden_dim = input_dim
    attention_dim = attention_dim or (hidden_dim // num_heads)
    value_dim = value_dim or (hidden_dim // num_heads)
    feedforward_hidden_dim = feedforward_hidden_dim or hidden_dim

    self._masked_attention = MaskedMultiHeadSelfAttention(
        num_heads, hidden_dim, attention_dim * num_heads,
        value_dim * num_heads, attention_dropout=attention_dropout)
    self._masked_attention_norm = LayerNorm(hidden_dim)

    self._attention = MultiHeadAttention(
        num_heads, hidden_dim, hidden_dim, attention_dim * num_heads,
        value_dim * num_heads, attention_dropout=attention_dropout)
    self._dropout = torch.nn.Dropout(residual_dropout)
    self._attention_norm = LayerNorm(hidden_dim)

    # use a feedforward net as the transition function
    self._feedforward = FeedForward(
        hidden_dim,
        num_layers=2,
        hidden_dims=[feedforward_hidden_dim, hidden_dim],
        activations=[Activation.by_name('relu')(),
                     Activation.by_name('linear')()],
        dropout=feedforward_dropout)
    self._feedforward_norm = LayerNorm(hidden_dim)

    self._use_vanilla_wiring = use_vanilla_wiring
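# A hedged sketch of the causal (look-ahead) mask that masked self-attention in a
# decoder block typically applies; whether `MaskedMultiHeadSelfAttention` builds its
# mask exactly this way is an assumption.
import torch


def causal_attention_mask(seq_len: int) -> torch.Tensor:
    # True where a query position may attend to a key position (no future positions)
    return torch.ones(seq_len, seq_len, dtype=torch.bool).tril()


def apply_causal_mask(scores: torch.Tensor) -> torch.Tensor:
    # scores: (batch_size, num_heads, seq_len, seq_len) raw attention logits
    mask = causal_attention_mask(scores.size(-1)).to(scores.device)
    return scores.masked_fill(~mask, float('-inf')).softmax(dim=-1)


# usage sketch: apply_causal_mask(torch.randn(2, 8, 5, 5))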
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             sentence_encoder: Seq2SeqEncoder,
             document_encoder: Seq2SeqEncoder,
             relation_encoder: Seq2SeqEncoder,
             document_relation_encoder: Seq2SeqEncoder,
             vocab: Vocabulary,
             encoder_dropout: float = 0.5,
             ffn_dropout: float = 0.2) -> None:
    # We have to pass the vocabulary to the constructor.
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.encoder_dropout = torch.nn.Dropout(p=encoder_dropout)

    self.sentence_encoder = sentence_encoder
    self.sentence_attn = LinearAttention(
        input_dim=self.sentence_encoder.get_output_dim())

    self.document_encoder = document_encoder
    self.document_attn = LinearAttention(
        input_dim=self.document_encoder.get_output_dim())

    self.relation_encoder = relation_encoder
    self.relation_attn = LinearAttention(
        input_dim=self.relation_encoder.get_output_dim())

    linear_dim = document_encoder.get_output_dim()
    feedforward_dim = 4 * linear_dim
    self.ffn = torch.nn.Sequential(
        torch.nn.Linear(linear_dim, feedforward_dim),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(ffn_dropout),
        torch.nn.Linear(feedforward_dim, linear_dim),
        torch.nn.Dropout(ffn_dropout))
    self.norm = LayerNorm(linear_dim)

    self.output = torch.nn.Linear(in_features=linear_dim, out_features=1)
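# A hedged sketch of masked attention pooling over encoder outputs, which is roughly
# the role of the `LinearAttention` modules above; the linear scoring head used here
# is an assumption, not the actual LinearAttention implementation.
import torch


def attention_pool(states: torch.Tensor, scores: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # states: (batch, seq_len, dim), scores: (batch, seq_len), mask: (batch, seq_len) bool
    scores = scores.masked_fill(~mask, float('-inf'))
    weights = torch.softmax(scores, dim=-1).unsqueeze(-1)  # (batch, seq_len, 1)
    return (weights * states).sum(dim=1)                   # (batch, dim)


# usage sketch with a hypothetical linear scorer:
scorer = torch.nn.Linear(8, 1)
states = torch.randn(2, 5, 8)
mask = torch.ones(2, 5, dtype=torch.bool)
pooled = attention_pool(states, scorer(states).squeeze(-1), mask)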
def __init__(
        self,
        hidden_size: int,
        num_layers: int = 7,
):
    super(SLSTMEncoder, self).__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers

    # h_t updates
    self.h_context_linearity = torch.nn.Linear(2 * hidden_size, 7 * hidden_size, bias=False)
    self.h_current_linearity = torch.nn.Linear(hidden_size, 7 * hidden_size, bias=False)
    self.h_input_linearity = torch.nn.Linear(hidden_size, 7 * hidden_size, bias=True)
    self.h_global_linearity = torch.nn.Linear(hidden_size, 7 * hidden_size, bias=False)

    # global state updates
    self.g_input_linearity = torch.nn.Linear(hidden_size, 3 * hidden_size, bias=True)
    self.g_hidden_linearity = torch.nn.Linear(hidden_size, hidden_size, bias=False)
    self.g_avg_linearity = torch.nn.Linear(hidden_size, 2 * hidden_size, bias=False)

    # layer normalization layers
    self.layer_norms = torch.nn.ModuleList(
        [LayerNorm(hidden_size) for _ in range(10)])

    self.reset_parameters()
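# A hedged sketch of how the 7 * hidden_size gate pre-activations produced by the
# linear layers above are typically split into per-gate tensors. The number of gates
# follows the projection sizes; treating the last chunk as the tanh candidate and the
# rest as sigmoid gates is an assumption about this sentence-state LSTM implementation.
import torch

hidden_size = 4
pre_activations = torch.randn(2, 6, 7 * hidden_size)    # (batch, seq_len, 7 * hidden)
gates = torch.chunk(pre_activations, 7, dim=-1)         # seven (batch, seq_len, hidden) tensors
candidate = torch.tanh(gates[-1])                       # candidate cell update
gate_values = [torch.sigmoid(g) for g in gates[:-1]]    # six sigmoid gates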
def __init__(
        self,
        input_dim: int,  # input embedding dimension
        num_layers: int = 6,
        num_heads: int = 8,
        feedforward_hidden_dim: Optional[int] = None,
        feedforward_dropout: float = 0.1,
        attention_dim: Optional[int] = None,
        value_dim: Optional[int] = None,
        residual_dropout: float = 0.1,
        attention_dropout: float = 0.1,
        use_positional_embedding: bool = True,
):
    """
    Construct the Transformer decoder stack, i.e. everything in the Transformer model
    between the positional embedding and the final linear projection. The embedding
    and the linear projection themselves are implemented elsewhere.

    :param num_layers: the number of stacked decoder blocks
    """
    super(TransformerDecoder, self).__init__()

    self._mask_attention_layers: List[MaskedMultiHeadSelfAttention] = []
    self._mask_attention_norm_layers: List[LayerNorm] = []
    self._attention_layers: List[MultiHeadAttention] = []
    self._attention_norm_layers: List[LayerNorm] = []
    self._feedforward_layers: List[FeedForward] = []
    self._feedforward_norm_layers: List[LayerNorm] = []

    hidden_dim = input_dim  # dimension of the hidden states output by the decoder
    attention_dim = attention_dim or (hidden_dim // num_heads)
    value_dim = value_dim or (hidden_dim // num_heads)
    feedforward_hidden_dim = feedforward_hidden_dim or hidden_dim

    for i in range(num_layers):
        masked_attention = MaskedMultiHeadSelfAttention(
            num_heads, hidden_dim, attention_dim * num_heads,
            value_dim * num_heads, attention_dropout=attention_dropout)
        self.add_module(f'masked_attention_{i}', masked_attention)
        self._mask_attention_layers.append(masked_attention)

        masked_attention_norm = LayerNorm(hidden_dim)
        self.add_module(f'masked_attention_norm_{i}', masked_attention_norm)
        self._mask_attention_norm_layers.append(masked_attention_norm)

        attention = MultiHeadAttention(
            num_heads, hidden_dim, hidden_dim, attention_dim * num_heads,
            value_dim * num_heads, attention_dropout=attention_dropout)
        self.add_module(f'attention_{i}', attention)
        self._attention_layers.append(attention)

        attention_norm = LayerNorm(hidden_dim)
        self.add_module(f'attention_norm_{i}', attention_norm)
        self._attention_norm_layers.append(attention_norm)

        feedforward = FeedForward(
            hidden_dim,
            num_layers=2,
            hidden_dims=[feedforward_hidden_dim, hidden_dim],
            activations=[Activation.by_name('relu')(),
                         Activation.by_name('linear')()],
            dropout=feedforward_dropout)
        self.add_module(f"feedforward_{i}", feedforward)
        self._feedforward_layers.append(feedforward)

        feedforward_norm = LayerNorm(hidden_dim)
        self.add_module(f"feedforward_norm_{i}", feedforward_norm)
        self._feedforward_norm_layers.append(feedforward_norm)

    self._dropout = torch.nn.Dropout(residual_dropout)

    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self._use_positional_embedding = use_positional_embedding
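# A minimal, hedged sketch of the per-layer wiring the decoder above implies
# (masked self-attention -> cross-attention -> feedforward, each followed by residual
# dropout and a post-norm). It uses torch.nn stand-ins instead of the
# MaskedMultiHeadSelfAttention / MultiHeadAttention / FeedForward modules above.
import torch
from torch import nn


class DecoderBlockSketch(nn.Module):
    def __init__(self, hidden_dim: int, num_heads: int, dropout: float = 0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)
        self.feedforward = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim))
        self.norms = nn.ModuleList(nn.LayerNorm(hidden_dim) for _ in range(3))
        self.dropout = nn.Dropout(dropout)

    def forward(self, targets: torch.Tensor, memory: torch.Tensor) -> torch.Tensor:
        # causal mask: True marks future positions that may not be attended to
        seq_len = targets.size(1)
        causal = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool,
                                       device=targets.device), diagonal=1)
        attended, _ = self.self_attn(targets, targets, targets, attn_mask=causal)
        targets = self.norms[0](targets + self.dropout(attended))
        attended, _ = self.cross_attn(targets, memory, memory)
        targets = self.norms[1](targets + self.dropout(attended))
        return self.norms[2](targets + self.dropout(self.feedforward(targets)))


# usage sketch: DecoderBlockSketch(16, 4)(torch.randn(2, 5, 16), torch.randn(2, 7, 16))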
def __init__(self,
             input_dim: int,
             hidden_dim: int,
             projection_dim: int,
             feedforward_hidden_dim: int,
             num_layers: int,
             num_attention_heads: int,
             dropout_prob: float = 0.1,
             residual_dropout_prob: float = 0.2,
             attention_dropout_prob: float = 0.1) -> None:
    super().__init__()

    self._attention_layers: List[MultiHeadSelfAttention] = []
    self._feedfoward_layers: List[FeedForward] = []
    self._layer_norm_layers: List[LayerNorm] = []
    self._feed_forward_layer_norm_layers: List[LayerNorm] = []
    self._reset_gate_layers: List[FeedForward] = []

    feedfoward_input_dim = input_dim
    for i in range(num_layers):
        feedfoward = FeedForward(feedfoward_input_dim,
                                 activations=[Activation.by_name('relu')(),
                                              Activation.by_name('linear')()],
                                 hidden_dims=[feedforward_hidden_dim, hidden_dim],
                                 num_layers=2,
                                 dropout=dropout_prob)

        # Note: Please use `ModuleList` in new code. It provides better
        # support for running on multiple GPUs. We've kept `add_module` here
        # solely for backwards compatibility with existing serialized models.
        self.add_module(f"feedforward_{i}", feedfoward)
        self._feedfoward_layers.append(feedfoward)

        feedforward_layer_norm = LayerNorm(feedfoward.get_output_dim())
        self.add_module(f"feedforward_layer_norm_{i}", feedforward_layer_norm)
        self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

        self_attention = MultiHeadSelfAttention(
            num_heads=num_attention_heads,
            input_dim=hidden_dim,
            attention_dim=projection_dim,
            values_dim=projection_dim,
            attention_dropout_prob=attention_dropout_prob)
        self.add_module(f"self_attention_{i}", self_attention)
        self._attention_layers.append(self_attention)

        reset_gate = FeedForward(feedforward_hidden_dim,
                                 activations=Activation.by_name('sigmoid')(),
                                 hidden_dims=hidden_dim,
                                 num_layers=1,
                                 dropout=dropout_prob)
        self.add_module(f"reset_gate_{i}", reset_gate)
        self._reset_gate_layers.append(reset_gate)

        layer_norm = LayerNorm(self_attention.get_output_dim())
        self.add_module(f"layer_norm_{i}", layer_norm)
        self._layer_norm_layers.append(layer_norm)

        feedfoward_input_dim = hidden_dim

    self.dropout = Dropout(residual_dropout_prob)
    self._input_dim = input_dim
    self._output_dim = self._attention_layers[-1].get_output_dim()
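# A hedged sketch of one common gated residual connection that the `reset_gate`
# layers above suggest: a sigmoid gate interpolates between the self-attention output
# and the sub-layer input. The exact wiring in this encoder's forward pass is an
# assumption, not taken from the class above.
import torch

inputs = torch.randn(2, 5, 8)                # input to the self-attention sub-layer
attended = torch.randn(2, 5, 8)              # output of the self-attention sub-layer
gate = torch.sigmoid(torch.randn(2, 5, 8))   # in the encoder this would come from reset_gate_i
gated = gate * attended + (1.0 - gate) * inputs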
def __init__(self, size, dropout):
    super(SublayerConnection, self).__init__()
    self.norm = LayerNorm(size)
    self.dropout = nn.Dropout(dropout)
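# The snippet above only shows the constructor. A commonly used forward pass for this
# kind of sub-layer wrapper (as in "The Annotated Transformer") normalizes first, then
# applies the sub-layer, dropout, and a residual connection. Treat this self-contained
# version as a sketch with torch.nn stand-ins, not the verified original class.
import torch
from torch import nn


class SublayerConnectionSketch(nn.Module):
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = nn.LayerNorm(size)   # stand-in for the LayerNorm used above
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        # residual connection around any sub-layer that preserves the input size
        return x + self.dropout(sublayer(self.norm(x)))


# usage sketch: SublayerConnectionSketch(16, 0.1)(torch.randn(2, 5, 16), nn.Linear(16, 16))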
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             title_encoder: Seq2VecEncoder,
             abstract_encoder: Seq2VecEncoder,
             venue_encoder: Seq2VecEncoder,
             body_encoder: Seq2VecEncoder = None,
             predict_mode: bool = False,
             author_text_embedder: TextFieldEmbedder = None,
             venue_field_embedder: TextFieldEmbedder = None,
             author_text_encoder: Seq2VecEncoder = None,
             # author_id_embedder: Optional[Embedding] = None,
             author_id_embedder: TextFieldEmbedder = None,
             # author_position_embedder: Optional[Embedding] = None,
             author_position_embedder: TextFieldEmbedder = None,
             feedforward: FeedForward = None,
             author_feedforward: FeedForward = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             max_num_authors: Optional[int] = 5,
             dropout: Optional[float] = None,
             ignore_authors: Optional[bool] = False,
             layer_norm: Optional[bool] = True,
             embedding_layer_norm: Optional[bool] = False,
             loss_distance: Optional[str] = 'l2-norm',
             loss_margin: Optional[float] = 1,
             bert_finetune: Optional[bool] = False,
             include_venue: Optional[bool] = False) -> None:
    super(Specter, self).__init__(vocab, regularizer)

    for lbl in range(max_num_authors):
        vocab.add_token_to_namespace(token=str(lbl), namespace='author_positions')

    self.text_field_embedder = text_field_embedder
    self.venue_field_embedder = venue_field_embedder
    self.title_encoder = title_encoder
    self.abstract_encoder = abstract_encoder
    self.body_encoder = body_encoder
    self.venue_encoder = venue_encoder
    self.predict_mode = predict_mode

    self.feedforward = feedforward

    if loss_distance == 'l2-norm':
        self.loss = torch.nn.TripletMarginLoss(margin=loss_margin, reduction='none')
    elif loss_distance == 'binary':
        self.loss = BinaryLoss(margin=loss_margin)
    else:
        self.loss = TripletLoss(margin=loss_margin, distance=loss_distance, reduction='none')

    if layer_norm:
        self.layer_norm = LayerNorm(self.feedforward.get_output_dim())
    self.do_layer_norm = layer_norm
    # self.layer_norm_author_embedding = LayerNorm(author_feedforward.get_output_dim())

    if embedding_layer_norm:
        self.layer_norm_word_embedding = LayerNorm(self.title_encoder.get_input_dim())
        self.layer_norm_word_embedding_venue = LayerNorm(self.venue_encoder.get_input_dim())
    self.embedding_layer_norm = embedding_layer_norm

    # apply the configured dropout probability if one is given
    self.dropout = Dropout(dropout) if dropout is not None else Dropout()

    self.ignore_authors = ignore_authors
    if not ignore_authors:
        self.author_id_embedder = author_id_embedder
        self.author_position_embedder = author_position_embedder
        self.author_text_embedder = author_text_embedder
        self.author_text_encoder = author_text_encoder
        # The author representation is a concatenation of author-id and author-position
        # embeddings of shape [batch, num-authors, auth-dim + position-dim]; a
        # time-distributed MLP on top maps it to [batch, num-authors, dim].
        self.author_time_dist_ff = TimeDistributed(author_feedforward)

    # Internal flag indicating whether the title/abstract should be encoded with a
    # transformer. Do not change it here: it must stay `False` in this class and is
    # set to `True` in the constructor of the inheriting
    # `PaperRepresentationTransoformer` class.
    self.tansformer_encoder = False

    self.bert_finetune = bert_finetune
    self.include_venue = include_venue

    initializer(self)
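# A hedged sketch of how the triplet objective configured above (the 'l2-norm' branch
# with reduction='none') is typically applied to anchor/positive/negative paper
# embeddings; the variable names and the final mean reduction are assumptions.
import torch

loss_fn = torch.nn.TripletMarginLoss(margin=1.0, reduction='none')
anchor = torch.randn(4, 32)     # embeddings of the query papers
positive = torch.randn(4, 32)   # embeddings of related (e.g. cited) papers
negative = torch.randn(4, 32)   # embeddings of unrelated papers
per_example_loss = loss_fn(anchor, positive, negative)  # shape (4,)
batch_loss = per_example_loss.mean()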