Example #1
    def __init__(self, *, capacity_factor: float, drop_tokens: bool,
                 is_scale_prob: bool, n_experts: int, expert: FeedForward,
                 d_model: int):
        """
        * `capacity_factor` is the capacity of each expert as a factor relative to ideally balanced load
        * `drop_tokens` specifies whether to drop tokens if more tokens are routed to an expert than the capacity
        * `is_scale_prob` specifies whether to multiply the input to the FFN by the routing probability
        * `n_experts` is the number of experts
        * `expert` is the expert layer, a [FFN module](../feed_forward.html);
          its hidden size and dropout are set when that module is constructed
        * `d_model` is the number of features in a token embedding
        """
        super().__init__()

        self.capacity_factor = capacity_factor
        self.is_scale_prob = is_scale_prob
        self.n_experts = n_experts
        self.drop_tokens = drop_tokens

        # Make copies of the FFNs
        self.experts = clone_module_list(expert, n_experts)
        # Routing layer and softmax
        self.switch = nn.Linear(d_model, n_experts)
        self.softmax = nn.Softmax(dim=-1)
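
A minimal sketch, not from the source, of what the routing layer and softmax above are used for: each token's routing probabilities are computed and the token is assigned to its highest-probability expert. The sizes and tensor names are illustrative assumptions.

import torch
import torch.nn as nn

d_model, n_experts = 16, 4                      # toy sizes
switch = nn.Linear(d_model, n_experts)          # routing layer, as above
softmax = nn.Softmax(dim=-1)

tokens = torch.randn(10, d_model)               # 10 tokens, flattened over batch and sequence
route_prob = softmax(switch(tokens))            # routing probabilities, shape [10, n_experts]
route_prob_max, routes = torch.max(route_prob, dim=-1)  # best expert and its probability per token
# Indices of the tokens assigned to each expert
indexes_list = [torch.eq(routes, i).nonzero(as_tuple=True)[0] for i in range(n_experts)]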
Example #2
    def __init__(self, transformer_layer: TransformerLayer, n_layers: int,
                 patch_emb: PatchEmbeddings,
                 pos_emb: LearnedPositionalEmbeddings,
                 classification: ClassificationHead):
        """
        * `transformer_layer` is a copy of a single [transformer layer](../models.html#TransformerLayer).
         We make copies of it to build the transformer with `n_layers` layers.
        * `n_layers` is the number of [transformer layers](../models.html#TransformerLayer).
        * `patch_emb` is the [patch embeddings layer](#PatchEmbeddings).
        * `pos_emb` is the [positional embeddings layer](#LearnedPositionalEmbeddings).
        * `classification` is the [classification head](#ClassificationHead).
        """
        super().__init__()
        # Patch embeddings
        self.patch_emb = patch_emb
        self.pos_emb = pos_emb
        # Classification head
        self.classification = classification
        # Make copies of the transformer layer
        self.transformer_layers = clone_module_list(transformer_layer,
                                                    n_layers)

        # `[CLS]` token embedding
        self.cls_token_emb = nn.Parameter(torch.randn(1, 1,
                                                      transformer_layer.size),
                                          requires_grad=True)
        # Final normalization layer
        self.ln = nn.LayerNorm([transformer_layer.size])
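
A minimal sketch, with assumed toy sizes, of how the `cls_token_emb` parameter above is typically used: it is repeated over the batch and prepended to the patch embeddings before the transformer layers, and its output row feeds the classification head.

import torch

d_model = 64
x = torch.randn(49, 4, d_model)                        # patch embeddings: [patches, batch, d_model]
cls_token_emb = torch.randn(1, 1, d_model)             # the learned `[CLS]` embedding
cls_token = cls_token_emb.expand(-1, x.shape[1], -1)   # repeat it for every item in the batch
x = torch.cat([cls_token, x], dim=0)                   # prepend: [1 + patches, batch, d_model]
cls_rep = x[0]                                         # after the layers, this row goes to the classification head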
Example #3
    def __init__(self, layer: FeedbackTransformerLayer, n_layers: int, d_model: int, heads: int):
        """
        * `layer` is the feedback transformer layer, which we clone for each layer
        * `n_layers` is the number of layers in the transformer
        * `d_model` is the number of features in the transformer
        * `heads` is the number of attention heads
        """

        super().__init__()
        # Make copies of the transformer layer
        self.layers = clone_module_list(layer, n_layers)
        # Final normalization layer
        self.norm = nn.LayerNorm([layer.size])
        # Memory vectors are computed as a weighted sum of representations of each layer.
        # This is the weights parameter for that.
        self.weights = nn.Parameter(torch.ones(n_layers + 1), requires_grad=True)
        # Softmax for weights before taking the weighted sum
        self.softmax = nn.Softmax(0)

        # Number of features in a head
        d_k = d_model // heads
        # Module to transform embeddings (memory) to get keys
        self.key = PrepareForMultiHeadAttention(d_model, heads, d_k, bias=False)
        # Module to transform embeddings (memory) to get values
        self.value = PrepareForMultiHeadAttention(d_model, heads, d_k, bias=False)

        # Memory for stacked keys
        self.mem_key = Stack(512)
        # Memory for stacked values
        self.mem_value = Stack(512)
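
A rough sketch of what the key and value projections above are for, with the `Stack` memories replaced by plain Python lists and toy sizes assumed: the memory vector of each step is projected into per-head keys and values and appended to the running memory.

import torch
import torch.nn as nn

d_model, heads = 16, 4
d_k = d_model // heads
key_proj = nn.Linear(d_model, heads * d_k, bias=False)    # stand-in for the key projection module
value_proj = nn.Linear(d_model, heads * d_k, bias=False)  # stand-in for the value projection module

mem_key, mem_value = [], []                        # stand-ins for the stacked key/value memories
mem = torch.randn(2, d_model)                      # memory vector for one step, batch of 2
mem_key.append(key_proj(mem).view(2, heads, d_k))      # per-head keys for this step
mem_value.append(value_proj(mem).view(2, heads, d_k))  # per-head values for this step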
Example #4
    def __init__(self, layer: FastWeightsAttentionTransformerLayer,
                 n_layers: int):
        super().__init__()
        # Make copies of the transformer layer
        self.layers = clone_module_list(layer, n_layers)
        # Final normalization layer
        self.norm = nn.LayerNorm([layer.size])
Example #5
    def __init__(self, layer: FeedbackTransformerLayer, n_layers: int):
        super().__init__()
        # Make copies of the transformer layer
        self.layers = clone_module_list(layer, n_layers)
        # Final normalization layer
        self.norm = nn.LayerNorm([layer.size])
        # Weights for the weighted sum of layer representations used as memory
        self.weights = nn.Parameter(torch.ones(n_layers + 1), requires_grad=True)
        # Softmax for the weights before taking the weighted sum
        self.softmax = nn.Softmax(0)
Example #6
    def __init__(self, layer: FeedbackTransformerLayer, n_layers: int):
        """
        * `layer` is the feedback transformer layer, which we clone for each layer
        * `n_layers` is the number of layers in the transformer
        """

        super().__init__()
        # Make copies of the transformer layer
        self.layers = clone_module_list(layer, n_layers)
        # Final normalization layer
        self.norm = nn.LayerNorm([layer.size])
        # Memory vectors are computed as a weighted sum of representations of each layer.
        # This is the weights parameter for that.
        self.weights = nn.Parameter(torch.ones(n_layers + 1), requires_grad=True)
        # Softmax for weights before taking the weighted sum
        self.softmax = nn.Softmax(0)
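
A minimal sketch, with assumed toy sizes, of the weighted sum the `weights` parameter and softmax above are meant for: the memory vector at a step is a softmax-weighted combination of the input embedding and every layer's output.

import torch
import torch.nn as nn

n_layers, d_model = 3, 8
weights = nn.Parameter(torch.ones(n_layers + 1))
softmax = nn.Softmax(0)

layer_outputs = torch.stack([torch.randn(d_model) for _ in range(n_layers + 1)])  # embedding + each layer's output at one step
mem = torch.einsum('l,ld->d', softmax(weights), layer_outputs)                    # weighted sum over the layer dimension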
Example #7
    def __init__(self, conv_mixer_layer: ConvMixerLayer, n_layers: int,
                 patch_emb: PatchEmbeddings,
                 classification: ClassificationHead):
        """
        * `conv_mixer_layer` is a copy of a single [ConvMixer layer](#ConvMixerLayer).
         We make copies of it to build the ConvMixer with `n_layers` layers.
        * `n_layers` is the number of ConvMixer layers (or depth), $d$.
        * `patch_emb` is the [patch embeddings layer](#PatchEmbeddings).
        * `classification` is the [classification head](#ClassificationHead).
        """
        super().__init__()
        # Patch embeddings
        self.patch_emb = patch_emb
        # Classification head
        self.classification = classification
        # Make copies of the [ConvMixer layer](#ConvMixerLayer)
        self.conv_mixer_layers = clone_module_list(conv_mixer_layer, n_layers)
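
A minimal sketch of the forward ordering this constructor implies, using stand-in modules (ordinary convolutions and a pooling head, not the real PatchEmbeddings, ConvMixerLayer, or ClassificationHead): patch embeddings, then the stack of layers, then the classification head.

import copy
import torch
import torch.nn as nn

patch_emb = nn.Conv2d(3, 32, kernel_size=7, stride=7)           # stand-in patch embeddings
conv_mixer_layer = nn.Conv2d(32, 32, kernel_size=3, padding=1)  # stand-in for one ConvMixer layer
layers = nn.ModuleList([copy.deepcopy(conv_mixer_layer) for _ in range(3)])
classification = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, 10))

x = torch.randn(2, 3, 28, 28)   # a toy batch of images
x = patch_emb(x)                # [2, 32, 4, 4]
for layer in layers:
    x = layer(x)                # each layer keeps the spatial shape
logits = classification(x)      # [2, 10]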
Example #8
    def __init__(self, layer: TransformerLayer, n_layers: int):
        super().__init__()
        # Make copies of the transformer layer
        self.layers = clone_module_list(layer, n_layers)
        # Final normalization layer
        self.norm = nn.LayerNorm([layer.size])
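
Every example above relies on `clone_module_list` to duplicate a prototype layer. A rough equivalent, stated as an assumption about the helper rather than its actual source, is a list of deep copies collected into an `nn.ModuleList`:

import copy
import torch.nn as nn

def clone_module_list_sketch(module: nn.Module, n: int) -> nn.ModuleList:
    # Deep-copy the prototype layer n times so each copy has independent parameters
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])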