    def test_topk_equals_length_attention_masked(self):
        # With topk equal to the sequence length, improved clustered attention
        # should reproduce full attention on the valid (unmasked) positions.
        d_model = 32
        n_heads = 4
        improved_transformer = TransformerEncoder([
            TransformerEncoderLayer(
                AttentionLayer(
                    ImprovedClusteredAttention(clusters=10, topk=20), d_model,
                    n_heads), d_model, n_heads) for i in range(6)
        ])
        full_transformer = TransformerEncoder([
            TransformerEncoderLayer(
                AttentionLayer(FullAttention(), d_model, n_heads), d_model,
                n_heads) for i in range(6)
        ])
        full_transformer = full_transformer.to("cuda")
        improved_transformer = improved_transformer.to("cuda")
        improved_transformer.load_state_dict(full_transformer.state_dict())
        improved_transformer.eval()
        full_transformer.eval()
        x = torch.rand(100, 20, d_model).to("cuda")
        lengths = x.new_full((100, ), 20, dtype=torch.int64)
        lengths[1] = 5
        lengths[10] = 10
        length_mask = LengthMask(lengths=lengths, max_len=20)
        y_full = full_transformer(x, length_mask=length_mask)
        y_improved = improved_transformer(x, length_mask=length_mask)
        self.assertLess(
            torch.max(torch.abs(y_improved[1, :5] - y_full[1, :5])), 1e-4)
        self.assertLess(
            torch.max(torch.abs(y_improved[10, :10] - y_full[10, :10])), 1e-4)
    def test_clustered_attention_forward(self):
        d_model = 128
        n_heads = 4
        transformer = TransformerEncoder([
            TransformerEncoderLayer(
                AttentionLayer(ClusteredAttention(clusters=10), d_model,
                               n_heads), d_model, n_heads) for i in range(6)
        ])
        x = transformer(torch.rand(100, 20, d_model))
        self.assertEqual(x.shape, (100, 20, d_model))
    def test_full_attention_forward(self):
        d_model = 128
        n_heads = 4
        transformer = TransformerEncoder([
            TransformerEncoderLayer(
                AttentionLayer(FullAttention(), d_model, n_heads), d_model,
                n_heads) for i in range(6)
        ])
        x = transformer(torch.rand(10, 7, d_model))
        self.assertEqual(x.shape, (10, 7, d_model))
    def test_improved_clustered_attention_forward(self):
        d_model = 128
        n_heads = 4
        transformer = TransformerEncoder([
            TransformerEncoderLayer(
                AttentionLayer(ImprovedClusteredAttention(clusters=10, topk=5),
                               d_model, n_heads), d_model, n_heads)
            for i in range(6)
        ])
        x = torch.rand(100, 20, d_model)
        y = transformer(x)
        self.assertEqual(y.shape, (100, 20, d_model))
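
These test methods appear to come from the pytorch-fast-transformers test suite and would sit inside a unittest.TestCase subclass; the imports they rely on are omitted in the snippets. A minimal sketch of the missing scaffolding follows (module paths and the test-class name are assumptions based on the library these examples appear to target):

import unittest

import torch

from fast_transformers.attention import (AttentionLayer, FullAttention,
                                          ClusteredAttention,
                                          ImprovedClusteredAttention,
                                          ReformerAttention)
from fast_transformers.masking import LengthMask
from fast_transformers.transformers import (TransformerEncoder,
                                            TransformerEncoderLayer)


class TransformerEncoderTest(unittest.TestCase):  # hypothetical class name
    ...  # the test methods shown above and below go here
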
Example #5
    def test_full_attention_forward(self):
        d_model = 128
        n_heads = 4
        transformer = TransformerEncoder([
            TransformerEncoderLayer(
                AttentionLayer(ClusteredAttention(clusters=10), d_model,
                               n_heads), d_model, n_heads) for i in range(6)
        ])

        transformer = transformer.to("cuda")
        x = torch.rand(100, 20, d_model).to("cuda")
        y = transformer(x)
        self.assertEqual(y.shape, (100, 20, d_model))
    def test_topk_equals_length_attention(self):
        # With topk equal to the sequence length, improved clustered attention
        # should match full attention up to numerical tolerance.
        d_model = 32
        n_heads = 4
        improved_transformer = TransformerEncoder([
            TransformerEncoderLayer(
                AttentionLayer(
                    ImprovedClusteredAttention(clusters=10, topk=20), d_model,
                    n_heads), d_model, n_heads) for i in range(6)
        ])
        full_transformer = TransformerEncoder([
            TransformerEncoderLayer(
                AttentionLayer(FullAttention(), d_model, n_heads), d_model,
                n_heads) for i in range(6)
        ])
        full_transformer = full_transformer.to("cuda")
        improved_transformer = improved_transformer.to("cuda")
        improved_transformer.load_state_dict(full_transformer.state_dict())
        improved_transformer.eval()
        full_transformer.eval()
        x = torch.rand(100, 20, d_model).to("cuda")
        y_full = full_transformer(x)
        y_improved = improved_transformer(x)
        self.assertLess(torch.max(torch.abs(y_improved - y_full)), 1e-4)
    def test_improved_clustered_attention_forward(self):
        d_model = 128
        n_heads = 4
        transformer = TransformerEncoder([
            TransformerEncoderLayer(
                AttentionLayer(
                    ReformerAttention(
                        chunk_size=32,
                        rounds=4,
                        bits=8,
                        masked=False,
                    ), d_model, n_heads), d_model, n_heads) for i in range(6)
        ])
        x = torch.rand(12, 128, d_model)
        y = transformer(x)
        self.assertEqual(y.shape, (12, 128, d_model))
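
For comparison, equivalent six-layer encoders can also be assembled through the library's builder interface instead of stacking layers by hand; a minimal sketch (the builder keyword names are an assumption about the same pytorch-fast-transformers version the tests target):

from fast_transformers.builders import TransformerEncoderBuilder

# Six-layer full-attention encoder comparable to the hand-built stacks above.
transformer = TransformerEncoderBuilder.from_kwargs(
    n_layers=6,
    n_heads=4,
    query_dimensions=128 // 4,
    value_dimensions=128 // 4,
    feed_forward_dimensions=4 * 128,
    attention_type="full",  # "clustered", "improved-clustered" or "reformer" should also work
).get()
y = transformer(torch.rand(10, 7, 128))
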
Example #8
    def __init__(self,
                 n_layer,
                 n_head,
                 d_model,
                 d_ff,
                 dropout=0.1,
                 activation='relu',
                 favor_feature_dims=None):
        super(FastTransformerDecoder, self).__init__()
        self.n_layer = n_layer
        self.n_head = n_head
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout = dropout
        self.activation = activation

        # FAVOR feature dimensions default to twice the per-head query dimension.
        self.favor_feature_dims = 2 * d_model // n_head \
                                  if favor_feature_dims is None else favor_feature_dims
        att_builder = AttentionBuilder.from_kwargs(
            query_dimensions=d_model // n_head,
            feature_map=Favor.factory(n_dims=self.favor_feature_dims))

        # One causal linear (FAVOR/Performer-style) attention layer per decoder block.
        self.attention_layers = [
            AttentionLayer(att_builder.get("causal-linear"),
                           d_model,
                           n_head,
                           positional_encoder=None) for l in range(n_layer)
        ]

        self.decoder_layers = nn.ModuleList()
        for l in range(n_layer):
            self.decoder_layers.append(
                TransformerEncoderLayer(attention=self.attention_layers[l],
                                        d_model=d_model,
                                        d_ff=d_ff,
                                        dropout=dropout,
                                        activation=activation))
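
The fragment above only constructs the layers; no forward pass is shown. A hypothetical companion method, assuming fast_transformers.masking.TriangularCausalMask is imported and the TransformerEncoderLayer.forward(x, attn_mask=..., length_mask=...) signature, could look like:

    # Hypothetical forward method (not in the original source).
    def forward(self, x):
        # x: (batch, seq_len, d_model); run each block under a causal mask.
        attn_mask = TriangularCausalMask(x.size(1), device=x.device)
        for layer in self.decoder_layers:
            x = layer(x, attn_mask=attn_mask)
        return x
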
Example #9
  def __init__(self, n_layer, n_head, d_model, d_ff,
               dropout=0.1,
               activation='relu',
               favor_feature_dims=None,
               spe_module=None,
               share_pe=False,
               share_spe_filter=False,
               use_gated_filter=True,
               spe_module_params=None):
    super(SPEFastTransformerDecoder, self).__init__()
    self.n_layer = n_layer
    self.n_head = n_head
    self.d_model = d_model
    self.d_ff = d_ff
    self.dropout = dropout
    self.activation = activation
    self.share_pe = share_pe
    self.use_gated_filter = use_gated_filter
    self.share_spe_filter = share_spe_filter

    self.spe_module = spe_module
    self._spe = None
    self._spe_filters = None

    if share_pe:
      # Register a single SPE module (only once!) and share it across layers.
      self.spe = self.spe_module(
                        num_heads=n_head, 
                        **(spe_module_params or {})
                      )
      self._spe = n_layer * [self.spe]
    else:
      # Make an SPE encoder for each layer and register them all.
      self.spe = nn.ModuleList([
        self.spe_module(
              num_heads=n_head,
              in_features=d_model // n_head,
              **(spe_module_params or {})
            )
        for _ in range(n_layer)
      ])
      self._spe = list(self.spe)

    if share_spe_filter:
      # Register a single SPE filter and share it across layers.
      self.spe_filters = SPEFilter(
                          code_shape=self._spe[0].code_shape, 
                          gated=use_gated_filter
                        )
      self._spe_filters = n_layer * [self.spe_filters]
    else:
      # Make a filter for each layer and register them.
      self.spe_filters = nn.ModuleList([
        SPEFilter(
          code_shape=pe.code_shape, 
          gated=use_gated_filter
        ) 
        for pe in self._spe
      ])
      self._spe_filters = list(self.spe_filters)

    self.favor_feature_dims = 2 * d_model // n_head \
                              if favor_feature_dims is None else favor_feature_dims
    att_builder = AttentionBuilder.from_kwargs(
      query_dimensions=d_model // n_head,
      feature_map=Favor.factory(n_dims=self.favor_feature_dims)
    )

    self.attention_layers = [
        AttentionLayer(
          att_builder.get("causal-linear"), 
          d_model, 
          n_head,
          positional_encoder=self._spe_filters[l].__call__
        )
        for l in range(n_layer)
    ]

    self.decoder_layers = nn.ModuleList()
    for l in range(n_layer):
      self.decoder_layers.append(
        TransformerEncoderLayer(
          attention=self.attention_layers[l],
          d_model=d_model, 
          d_ff=d_ff, 
          dropout=dropout, 
          activation=activation
        )
      )
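
For reference, a hypothetical instantiation of the class above; spe.SineSPE and its keyword arguments are assumptions about the SPE package, not taken from the original source:

import spe  # package assumed to provide SineSPE/ConvSPE and SPEFilter

decoder = SPEFastTransformerDecoder(
    n_layer=6, n_head=8, d_model=512, d_ff=2048,
    spe_module=spe.SineSPE,                      # hypothetical choice of SPE variant
    spe_module_params={'num_realizations': 64},  # assumed SineSPE keyword
    share_pe=True,
    share_spe_filter=False,
)
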
Example #10
    def __init__(self,
                 n_layer,
                 n_head,
                 d_model,
                 d_ff,
                 dropout=0.1,
                 activation='relu',
                 share_pe=False,
                 share_spe_filter=False):
        super(FastTransformerDecoder, self).__init__()
        self.n_layer = n_layer
        self.n_head = n_head
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout = dropout
        self.activation = activation
        self.share_pe = share_pe
        self.share_spe_filter = share_spe_filter

        self._spe = None
        self._spe_filters = None
        if 'positional_encoder' in self._cfg:
            make_pe = self._cfg['positional_encoder'].bind(num_heads=n_head)
            if share_pe:
                self.spe = make_pe()  # Register as a module (only once!)
                self._spe = n_layer * [self.spe]
            else:
                # Make an SPE encoder for each layer and register them all
                self.spe = nn.ModuleList([make_pe() for _ in range(n_layer)])
                self._spe = list(self.spe)

            make_filter = self._cfg['spe_filter'].bind(spe.SPEFilter)
            if share_spe_filter:
                self.spe_filters = make_filter(
                    code_shape=self._spe[0].code_shape)
                self._spe_filters = n_layer * [self.spe_filters]
            else:
                # Make a filter for each layer, register them
                self.spe_filters = nn.ModuleList([
                    make_filter(code_shape=pe.code_shape) for pe in self._spe
                ])
                self._spe_filters = list(self.spe_filters)

        self.attention_layers = [
            AttentionLayer(
                self._cfg['attention'].configure(
                    CausalLinearAttention,
                    query_dimensions=d_model // n_head,
                    feature_map=self._cfg['feature_map'].configure(
                        Favor.factory, n_dims=d_model // n_head)),
                d_model,
                n_head,
                # Do not register as submodules of the layer
                positional_encoder=(self._spe_filters[l].__call__
                                    if self._spe_filters else None))
            for l in range(n_layer)
        ]

        self.decoder_layers = nn.ModuleList()
        for l in range(n_layer):
            self.decoder_layers.append(
                TransformerEncoderLayer(attention=self.attention_layers[l],
                                        d_model=d_model,
                                        d_ff=d_ff,
                                        dropout=dropout,
                                        activation=activation))
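
This last variant reads its sub-components through confugue-style configuration (the self._cfg lookups with .bind and .configure). A hypothetical way to drive it, assuming the class is wrapped with confugue's @configurable decorator and that a YAML file supplies the 'attention', 'feature_map', 'positional_encoder' and 'spe_filter' sections:

from confugue import Configuration

cfg = Configuration.from_yaml_file('model.yaml')  # hypothetical config file
decoder = cfg.configure(FastTransformerDecoder,
                        n_layer=6, n_head=8, d_model=512, d_ff=2048)
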