def __init__(self, *, capacity_factor: float, drop_tokens: bool, is_scale_prob: bool,
             n_experts: int, expert: FeedForward, d_model: int):
    """
    * `capacity_factor` is the capacity of each expert as a factor relative to ideally balanced load
    * `drop_tokens` specifies whether to drop tokens if more tokens are routed to an expert than the capacity
    * `is_scale_prob` specifies whether to multiply the input to the FFN by the routing probability
    * `n_experts` is the number of experts
    * `expert` is the expert layer, a [FFN module](../feed_forward.html)
    * `d_model` is the number of features in a token embedding
    """
    super().__init__()

    self.capacity_factor = capacity_factor
    self.is_scale_prob = is_scale_prob
    self.n_experts = n_experts
    self.drop_tokens = drop_tokens

    # Make copies of the FFNs
    self.experts = clone_module_list(expert, n_experts)
    # Routing layer and softmax
    self.switch = nn.Linear(d_model, n_experts)
    self.softmax = nn.Softmax(dim=-1)
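# A minimal sketch of how this routing layer is typically applied in the
# forward pass: each token gets a probability distribution over experts and
# is sent to the argmax expert. Variable names below are illustrative
# assumptions, not taken from this section.
import torch
import torch.nn as nn

d_model, n_experts = 8, 4
switch = nn.Linear(d_model, n_experts)
softmax = nn.Softmax(dim=-1)

x = torch.randn(10, d_model)                             # 10 tokens
route_prob = softmax(switch(x))                          # [10, n_experts] routing probabilities
route_prob_max, routes = torch.max(route_prob, dim=-1)   # chosen expert per token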
def __init__(self, transformer_layer: TransformerLayer, n_layers: int,
             patch_emb: PatchEmbeddings, pos_emb: LearnedPositionalEmbeddings,
             classification: ClassificationHead):
    """
    * `transformer_layer` is a copy of a single [transformer layer](../models.html#TransformerLayer).
      We make copies of it to make the transformer with `n_layers`.
    * `n_layers` is the number of [transformer layers](../models.html#TransformerLayer).
    * `patch_emb` is the [patch embeddings layer](#PatchEmbeddings).
    * `pos_emb` is the [positional embeddings layer](#LearnedPositionalEmbeddings).
    * `classification` is the [classification head](#ClassificationHead).
    """
    super().__init__()
    # Patch embeddings
    self.patch_emb = patch_emb
    # Positional embeddings
    self.pos_emb = pos_emb
    # Classification head
    self.classification = classification
    # Make copies of the transformer layer
    self.transformer_layers = clone_module_list(transformer_layer, n_layers)

    # `[CLS]` token embedding
    self.cls_token_emb = nn.Parameter(torch.randn(1, 1, transformer_layer.size), requires_grad=True)
    # Final normalization layer
    self.ln = nn.LayerNorm([transformer_layer.size])
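# An illustrative sketch of how a learned `[CLS]` token is usually prepended
# to the patch sequence before the transformer layers run. A sequence-first
# layout `[patches, batch, d_model]` is assumed here; the actual forward pass
# is not shown in this section.
import torch
import torch.nn as nn

d_model, n_patches, batch = 16, 9, 2
cls_token_emb = nn.Parameter(torch.randn(1, 1, d_model))

x = torch.randn(n_patches, batch, d_model)      # patch embeddings
cls = cls_token_emb.expand(-1, batch, -1)       # one [CLS] per batch item
x = torch.cat([cls, x], dim=0)                  # [n_patches + 1, batch, d_model]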
def __init__(self, layer: FeedbackTransformerLayer, n_layers: int, d_model: int, heads: int):
    """
    * `layer` is the feedback transformer layer, which we clone for each layer
    * `n_layers` is the number of layers in the transformer
    * `d_model` is the number of features in the transformer
    * `heads` is the number of attention heads
    """
    super().__init__()
    # Make copies of the transformer layer
    self.layers = clone_module_list(layer, n_layers)
    # Final normalization layer
    self.norm = nn.LayerNorm([layer.size])
    # Memory vectors are computed as a weighted sum of representations of each layer.
    # This is the weights parameter for that.
    self.weights = nn.Parameter(torch.ones(n_layers + 1), requires_grad=True)
    # Softmax for weights before taking the weighted sum
    self.softmax = nn.Softmax(0)

    # Number of features in a head
    d_k = d_model // heads
    # Module to transform embeddings (memory) to get keys
    self.key = PrepareForMultiHeadAttention(d_model, heads, d_k, bias=False)
    # Module to transform embeddings (memory) to get values
    self.value = PrepareForMultiHeadAttention(d_model, heads, d_k, bias=False)

    # Memory for stacked keys
    self.mem_key = Stack(512)
    # Memory for stacked values
    self.mem_value = Stack(512)
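# Toy sketch of why keys and values are cached: each new memory vector is
# projected once and stored, so attention at later steps reuses the cached
# projections instead of recomputing them. Plain Python lists stand in for
# the `Stack(512)` helper (whose exact API isn't shown here), and
# `key_proj`/`value_proj` stand in for the PrepareForMultiHeadAttention
# modules; all names are illustrative assumptions.
import torch
import torch.nn as nn

d_model, heads, d_k = 16, 4, 4
key_proj = nn.Linear(d_model, heads * d_k, bias=False)
value_proj = nn.Linear(d_model, heads * d_k, bias=False)

mem_key, mem_value = [], []            # stand-ins for `Stack(512)`
for step in range(3):
    mem = torch.randn(d_model)         # new memory vector at this step
    mem_key.append(key_proj(mem))      # cache its key...
    mem_value.append(value_proj(mem))  # ...and its value for later steps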
def __init__(self, layer: FastWeightsAttentionTransformerLayer, n_layers: int):
    super().__init__()
    # Make copies of the transformer layer
    self.layers = clone_module_list(layer, n_layers)
    # Final normalization layer
    self.norm = nn.LayerNorm([layer.size])
def __init__(self, layer: FeedbackTransformerLayer, n_layers: int):
    super().__init__()
    # Make copies of the transformer layer
    self.layers = clone_module_list(layer, n_layers)
    # Final normalization layer
    self.norm = nn.LayerNorm([layer.size])
    # Memory vectors are computed as a weighted sum of representations of each layer.
    # This is the weights parameter for that.
    self.weights = nn.Parameter(torch.ones(n_layers + 1), requires_grad=True)
    # Softmax for weights before taking the weighted sum
    self.softmax = nn.Softmax(0)
def __init__(self, layer: FeedbackTransformerLayer, n_layers: int):
    """
    * `layer` is the feedback transformer layer, which we clone for each layer
    * `n_layers` is the number of layers in the transformer
    """
    super().__init__()
    # Make copies of the transformer layer
    self.layers = clone_module_list(layer, n_layers)
    # Final normalization layer
    self.norm = nn.LayerNorm([layer.size])
    # Memory vectors are computed as a weighted sum of representations of each layer.
    # This is the weights parameter for that.
    self.weights = nn.Parameter(torch.ones(n_layers + 1), requires_grad=True)
    # Softmax for weights before taking the weighted sum
    self.softmax = nn.Softmax(0)
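# A small illustrative sketch of the memory computation described in the
# comments above: the memory vector is a softmax-weighted sum of the input
# embedding and each layer's output. Names are assumptions for illustration.
import torch
import torch.nn as nn

n_layers, d_model = 4, 16
weights = nn.Parameter(torch.ones(n_layers + 1))
softmax = nn.Softmax(0)

# One representation per layer, plus the input embedding
layer_outputs = torch.stack([torch.randn(d_model) for _ in range(n_layers + 1)])
mem = torch.einsum('l,ld->d', softmax(weights), layer_outputs)  # weighted sum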
def __init__(self, conv_mixer_layer: ConvMixerLayer, n_layers: int,
             patch_emb: PatchEmbeddings,
             classification: ClassificationHead):
    """
    * `conv_mixer_layer` is a copy of a single [ConvMixer layer](#ConvMixerLayer).
      We make copies of it to make ConvMixer with `n_layers`.
    * `n_layers` is the number of ConvMixer layers (or depth), $d$.
    * `patch_emb` is the [patch embeddings layer](#PatchEmbeddings).
    * `classification` is the [classification head](#ClassificationHead).
    """
    super().__init__()
    # Patch embeddings
    self.patch_emb = patch_emb
    # Classification head
    self.classification = classification
    # Make copies of the [ConvMixer layer](#ConvMixerLayer)
    self.conv_mixer_layers = clone_module_list(conv_mixer_layer, n_layers)
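# `clone_module_list` is used by every constructor in this section. A minimal
# sketch of what such a helper typically does, assuming it deep-copies the
# prototype module; the actual implementation isn't shown here.
import copy
import torch.nn as nn

def clone_module_list(module: nn.Module, n: int) -> nn.ModuleList:
    # Each deep copy starts from the prototype's initial weights but is
    # trained independently of the others.
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])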
def __init__(self, layer: TransformerLayer, n_layers: int):
    super().__init__()
    # Make copies of the transformer layer
    self.layers = clone_module_list(layer, n_layers)
    # Final normalization layer
    self.norm = nn.LayerNorm([layer.size])
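# A hedged sketch of the forward pass that usually accompanies this
# constructor pattern: run the input through each cloned layer in order,
# then apply the final normalization. Purely illustrative; the actual
# forward method is not shown in this section, and nn.Linear stands in
# for the transformer layers.
import torch
import torch.nn as nn

d_model, n_layers = 16, 3
layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(n_layers)])
norm = nn.LayerNorm([d_model])

x = torch.randn(5, d_model)
for layer in layers:   # pass through each cloned layer
    x = layer(x)
x = norm(x)            # final layer normalization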