    def test_attention(self):
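        """Smoke-test: train a small masked self-attention model end to end."""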
        max_seq_len = random.randint(5, 10)
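        # Peek at a few batches from the data generator before training.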
        count = 0
        for data, tag in self.data_generator(4, max_seq_len):
            count += 1
            print(data, tag)
            if count > 2:
                break
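
        # MaskFlatten is not defined in this snippet. A minimal sketch, assuming
        # it is just a Flatten layer that propagates the incoming mask unchanged
        # (the default compute_mask pass-through once supports_masking is set):
        #
        #   class MaskFlatten(keras.layers.Flatten):
        #       def __init__(self, **kwargs):
        #           super(MaskFlatten, self).__init__(**kwargs)
        #           self.supports_masking = True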

        class AModel(keras.models.Model):
            def __init__(self, **kwargs):
                super(AModel, self).__init__(**kwargs)
                self.embedding = keras.layers.Embedding(input_dim=5, output_dim=3, mask_zero=True)
                self.attention = AttentionLayer(num_heads=5, size_per_head=3)
                self.timedist  = keras.layers.TimeDistributed(MaskFlatten())
                self.bigru = keras.layers.Bidirectional(keras.layers.GRU(units=8))
                self.softmax = keras.layers.Dense(units=2, activation="softmax")

            #def build(self, input_shape):
            #    super(AModel,self).build(input_shape)

            def call(self, inputs, training=None, mask=None):
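                # Dataflow: token ids -> masked embeddings -> multi-head
                # self-attention -> per-timestep flatten -> BiGRU -> 2-way softmax.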
                out = inputs
                out = self.embedding(out)
                out = self.attention(out)
                out = self.timedist(out)
                out = self.bigru(out)
                out = self.softmax(out)
                return out
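
        # Sequential version with the same layer stack as AModel above.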

        model = keras.models.Sequential([
            keras.layers.Embedding(input_dim=5, output_dim=3, mask_zero=True),
            AttentionLayer(num_heads=5, size_per_head=3),
            keras.layers.TimeDistributed(MaskFlatten()),
            keras.layers.Bidirectional(keras.layers.GRU(units=8)),
            keras.layers.Dense(units=2, activation="softmax")
        ])

        #model = AModel()
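        # Sparse loss/metric variants consume integer class labels directly.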
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.003),
                      loss=keras.losses.sparse_categorical_crossentropy,
                      metrics=[keras.metrics.sparse_categorical_accuracy])
        # Sequential models need an explicit input shape before summary().
        model.build(input_shape=(None, max_seq_len))
        model.summary()

        # model.fit() accepts Python generators directly; fit_generator() is deprecated.
        model.fit(
            self.data_generator(64, max_seq_len),
            steps_per_epoch=100,
            epochs=100,
            validation_data=self.data_generator(8, max_seq_len),
            validation_steps=10,
            callbacks=[
                keras.callbacks.EarlyStopping(monitor='val_sparse_categorical_accuracy', patience=5),
            ],
        )
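
    # From TransformerSelfAttentionLayer: sub-layers are constructed lazily in
    # build(), once the input shape is known, rather than in __init__.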
    def build(self, input_shape):
        self.input_spec = keras.layers.InputSpec(shape=input_shape)
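
        # Both sub-layers are built from the shared params object; "self" and
        # "output" name the attention and projection sub-layers respectively.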

        self.attention_layer = AttentionLayer.from_params(
            self.params,
            size_per_head=self.size_per_head,
            name="self",
        )
        self.attention_projector = ProjectionLayer.from_params(
            self.params,
            name="output",
        )

        super(TransformerSelfAttentionLayer, self).build(input_shape)