def __init__(self, vocab_size, n_layers=1, hidden_size=32, embeddings: nn.Embedding=None,
             use_attention=False, condition_attention=False, tokenwise_attention=False,
             query_dims=None, bidirectional=False):
    """
    Prepares an LSTM encoder for the Intervention, Comparator, or Outcome token sequences.

    Either initializes the embedding layer from existing embeddings or creates a random
    one of size vocab_size x hidden_size.

    When using attention we either:
        * condition on a hidden unit from the encoder and a query vector of size
          query_dims, pass a linear combination of the two through a non-linearity
          (Tanh), and then compress this to a single number, or
        * use a linear function of the output of the encoder.

    In both cases, a softmax over the possible outputs imposes the final attention
    distribution.
    """
    super(LSTMEncoder, self).__init__()

    if condition_attention and not use_attention:
        raise ValueError("Cannot condition attention when there is no attention mechanism! "
                         "Try setting use_attention to True or condition_attention to False.")
    if tokenwise_attention and not use_attention:
        raise ValueError("Cannot have tokenwise attention when there is no attention mechanism! "
                         "Try setting use_attention to True or tokenwise_attention to False.")

    self.vocab_size = vocab_size
    self.n_layers = n_layers
    self.use_attention = use_attention
    self.condition_attention = condition_attention
    self.tokenwise_attention = tokenwise_attention
    self.query_dims = query_dims
    self.bidirectional = bidirectional

    # A bidirectional LSTM concatenates forward and backward states, so halve the
    # per-direction hidden size to keep the overall encoding size equal to hidden_size.
    if self.bidirectional:
        self.hidden_size = hidden_size // 2
    else:
        self.hidden_size = hidden_size

    if embeddings is None:
        self.embedding = nn.Embedding(self.vocab_size, self.hidden_size)
        self.lstm = nn.LSTM(input_size=self.hidden_size, hidden_size=self.hidden_size,
                            num_layers=self.n_layers, batch_first=True,
                            bidirectional=self.bidirectional)
    else:
        self.embedding = embeddings
        self.lstm = nn.LSTM(input_size=embeddings.embedding_dim, hidden_size=self.hidden_size,
                            num_layers=self.n_layers, batch_first=True,
                            bidirectional=self.bidirectional)

    if self.use_attention:
        encoding_size = self.hidden_size + int(self.bidirectional) * self.hidden_size
        self.attention_mechanism = TokenAttention(encoding_size, self.query_dims,
                                                  condition_attention, tokenwise_attention)
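# The TokenAttention module referenced above is not shown here; the following is a
# minimal sketch of what the docstring describes (hypothetical class and attribute
# names, not the project's actual implementation). The conditioned variant scores each
# token from a Tanh over a linear combination of the encoder state and a query vector;
# the unconditioned variant scores each token with a single linear layer. A softmax
# over the token dimension then yields the attention distribution. How
# tokenwise_attention changes the normalization is not specified here, so it is omitted.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TokenAttentionSketch(nn.Module):
    def __init__(self, encoding_size, query_dims=None, condition_attention=False,
                 tokenwise_attention=False):
        super().__init__()
        self.condition_attention = condition_attention
        self.tokenwise_attention = tokenwise_attention
        if condition_attention:
            # linear combination of [encoder state; query] -> Tanh -> scalar score per token
            self.scorer = nn.Sequential(nn.Linear(encoding_size + query_dims, encoding_size),
                                        nn.Tanh(),
                                        nn.Linear(encoding_size, 1))
        else:
            # a linear function of the encoder output alone
            self.scorer = nn.Linear(encoding_size, 1)

    def forward(self, hidden, query=None):
        # hidden: (batch, seq_len, encoding_size); query: (batch, query_dims)
        if self.condition_attention:
            query = query.unsqueeze(1).expand(-1, hidden.size(1), -1)
            scores = self.scorer(torch.cat([hidden, query], dim=-1))
        else:
            scores = self.scorer(hidden)
        # softmax over the token dimension imposes the attention distribution
        return F.softmax(scores, dim=1)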
def __init__(self, vocab_size, embeddings: nn.Embedding=None, embedding_dim=200,
             use_attention=False, condition_attention=False, tokenwise_attention=False,
             query_dims=None):
    super(CBoWEncoder, self).__init__()

    self.vocab_size = vocab_size

    if embeddings is None:
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
    else:
        self.embedding = embeddings
        self.embedding_dim = embeddings.embedding_dim

    self.use_attention = use_attention
    self.query_dims = query_dims

    if self.use_attention:
        self.attention_mechanism = TokenAttention(self.embedding_dim, self.query_dims,
                                                  condition_attention, tokenwise_attention)
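# A brief usage sketch for the constructor above (hypothetical sizes; assumes a
# pretrained embedding matrix is available). When embeddings are supplied, the
# encoder's embedding_dim is taken from embeddings.embedding_dim rather than from
# the embedding_dim argument.
import torch
import torch.nn as nn

pretrained = nn.Embedding.from_pretrained(torch.randn(5000, 300))
cbow = CBoWEncoder(vocab_size=5000, embeddings=pretrained)
assert cbow.embedding_dim == 300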
def __init__(self, vocab_size, embeddings: nn.Embedding = None, use_attention=False,
             condition_attention=False, tokenwise_attention=False, query_dims=None,
             N=3, d_model=128, h=4, dropout=0.1, concat_relay=False):
    super(StarTransformerEncoder, self).__init__()

    self.d_model = d_model  # hidden dims for the transformer

    # The use_attention flag determines whether we impose attention over *tokens*,
    # which is independent of the self-attention mechanism used by the transformer.
    self.use_attention = use_attention
    self.condition_attention = condition_attention
    self.query_dims = query_dims

    attention_input_dims = self.d_model

    # If this is True, then we concatenate the relay node to all token embeddings.
    self.concat_relay = concat_relay
    if self.concat_relay:
        attention_input_dims = attention_input_dims + self.d_model

    if self.use_attention:
        self.attention_mechanism = TokenAttention(attention_input_dims, self.query_dims,
                                                  condition_attention, tokenwise_attention)

    if embeddings is None:
        # fall back to d_model-sized random embeddings when none are provided
        self.embedding = nn.Embedding(vocab_size, d_model)
    else:
        self.embedding = embeddings

    # We need to map word embedding inputs to the transformer hidden dims.
    self.projection_layer = nn.Linear(self.embedding.weight.shape[1], d_model)

    # StarTransformer args: 'hidden_size', 'num_layers', 'num_head', 'head_dim'
    # @TODO I do not understand what head_dim is...
    self.st = StarTransformer(d_model, N, h, d_model, dropout=dropout)
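# A rough sketch of how the pieces above fit together at encode time (the actual
# forward() is not shown here, and the StarTransformer call signature and return
# values are assumed from fastNLP). Tokens are embedded, projected to d_model, run
# through the star transformer, and, when concat_relay is set, the relay node is
# appended to every token representation before token-level attention is applied.
import torch

def _encode_sketch(encoder, token_ids, mask):
    # token_ids: (batch, seq_len) LongTensor; mask: (batch, seq_len), 1 on real tokens
    embedded = encoder.projection_layer(encoder.embedding(token_ids))  # (batch, seq_len, d_model)
    nodes, relay = encoder.st(embedded, mask)  # per-token states and the relay node
    if encoder.concat_relay:
        # broadcast the relay node across the sequence and concatenate it to each token
        relay_expanded = relay.unsqueeze(1).expand(-1, nodes.size(1), -1)
        nodes = torch.cat([nodes, relay_expanded], dim=-1)  # (batch, seq_len, 2 * d_model)
    return nodes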
def __init__(self, vocab_size, embeddings: nn.Embedding=None, embedding_dim=None,
             use_attention=False, condition_attention=False, tokenwise_attention=False,
             query_dims=None):
    super(CBoWEncoder, self).__init__()

    self.vocab_size = vocab_size

    if embeddings is None:
        assert embedding_dim is not None, \
            "If no embeddings are defined, we must at least define the input dimension!"
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
    else:
        self.embedding = embeddings
        self.embedding_dim = embeddings.embedding_dim

    self.use_attention = use_attention
    self.condition_attention = condition_attention
    self.query_dims = query_dims

    if self.use_attention:
        self.attention_mechanism = TokenAttention(self.embedding_dim, self.query_dims,
                                                  condition_attention, tokenwise_attention)
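# Constructor usage sketch for this variant (hypothetical sizes). Without pretrained
# embeddings, embedding_dim must be given explicitly, and with condition_attention
# enabled, query_dims should match the query vector that will be fed to the token
# attention mechanism.
cbow = CBoWEncoder(vocab_size=5000, embedding_dim=200,
                   use_attention=True, condition_attention=True, query_dims=32)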