@classmethod
def setUpClass(cls):
     """
     Avoid redundant, time-consuming, equivalent setups across the different
     test methods, which can share common instantiations.
     """
     feedforward_layer = PositionWiseFeedForward(
         token_representation_dimension=REPRESENTATION_DIMENSION,
         feedforward_dimension=FEEDFORWARD_DIMENSION,
         dropout_prob=DROPOUT_PROB)
     multi_head_attention_layer = MultiHeadAttention(
         n_attention_heads=N_ATTENTION_HEADS,
         token_representation_dimension=REPRESENTATION_DIMENSION,
         dropout_prob=DROPOUT_PROB)
     cls.layer = EncoderBlock(
         building_blocks=EncoderBlockBuildingBlocks(
             self_multi_head_attention_layer=deepcopy(
                 multi_head_attention_layer),
             fully_connected_layer=feedforward_layer),
         feature_dimension=REPRESENTATION_DIMENSION,
         dropout_prob=DROPOUT_PROB)
     cls.forward_propagation_kwargs = {
         'src_features':
         torch_rand(size=(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH,
                          REPRESENTATION_DIMENSION),
                    dtype=torch_float),
         'src_mask':
         torch_rand(size=(MINI_BATCH_SIZE, 1, MAX_SEQUENCE_LENGTH),
                    dtype=torch_float)
     }
     cls.expected_output_shapes = [(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH,
                                    REPRESENTATION_DIMENSION)]
     cls.expected_output_dtypes = [torch_float]
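
A minimal sketch of a test method that could consume the class-level fixtures prepared above; the method name and the handling of a single output tensor are illustrative assumptions, not taken from the original test class.

def test_forward_propagation_shapes_and_dtypes(self):
    # hypothetical test body, assuming the fixture names set in setUpClass:
    outputs = self.layer(**self.forward_propagation_kwargs)
    # a single returned tensor is wrapped into a tuple for uniform checks:
    if not isinstance(outputs, tuple):
        outputs = (outputs,)
    for output, expected_shape, expected_dtype in zip(
            outputs, self.expected_output_shapes,
            self.expected_output_dtypes):
        self.assertEqual(tuple(output.shape), expected_shape)
        self.assertEqual(output.dtype, expected_dtype)
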
@classmethod
def setUpClass(cls):
     """
     Avoid redundant, time-consuming, equivalent setups across the different
     test methods, which can share common instantiations.
     """
     cls.layer = PositionWiseFeedForward(
         token_representation_dimension=REPRESENTATION_DIMENSION,
         feedforward_dimension=FEEDFORWARD_DIMENSION,
         dropout_prob=DROPOUT_PROB)
     cls.forward_propagation_kwargs = {
         'features':
         torch_rand(size=(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH,
                          REPRESENTATION_DIMENSION),
                    dtype=torch_float)
     }
     cls.expected_output_shapes = [(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH,
                                    REPRESENTATION_DIMENSION)]
     cls.expected_output_dtypes = [torch_float]
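
For reference, a minimal sketch of the standard position-wise feed-forward sub-layer of the Transformer architecture, which is what a layer with this constructor signature typically implements; the internals of the PositionWiseFeedForward class under test are assumed here, not quoted from it.

from torch import Tensor, nn

class ReferencePositionWiseFeedForward(nn.Module):
    """Two position-wise linear maps with ReLU and dropout in between,
    mapping (batch, sequence, representation) to the same output shape."""

    def __init__(self, token_representation_dimension: int,
                 feedforward_dimension: int, dropout_prob: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(token_representation_dimension,
                                  feedforward_dimension)
        self.linear_2 = nn.Linear(feedforward_dimension,
                                  token_representation_dimension)
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, features: Tensor) -> Tensor:
        # applied identically and independently at every sequence position:
        return self.linear_2(self.dropout(self.linear_1(features).relu()))
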
@classmethod
def setUpClass(cls):
     """
     Avoid redundant, time-consuming, equivalent setups across the different
     test methods, which can share common instantiations.
     """
     positional_encoding_layer = PositionalEncoding(
         token_representation_dimension=REPRESENTATION_DIMENSION,
         dropout_prob=DROPOUT_PROB,
         max_sequence_length=MAX_SEQUENCE_LENGTH)
     src_embedder = Sequential(
         Embedder(token_representation_dimension=REPRESENTATION_DIMENSION,
                  vocabulary_dimension=SRC_VOCABULARY_DIMENSION),
         deepcopy(positional_encoding_layer))
     tgt_embedder = Sequential(
         Embedder(token_representation_dimension=REPRESENTATION_DIMENSION,
                  vocabulary_dimension=TGT_VOCABULARY_DIMENSION),
         deepcopy(positional_encoding_layer))
     feedforward_layer = PositionWiseFeedForward(
         token_representation_dimension=REPRESENTATION_DIMENSION,
         feedforward_dimension=FEEDFORWARD_DIMENSION,
         dropout_prob=DROPOUT_PROB)
     multi_head_attention_layer = MultiHeadAttention(
         n_attention_heads=N_ATTENTION_HEADS,
         token_representation_dimension=REPRESENTATION_DIMENSION,
         dropout_prob=DROPOUT_PROB)
     encoder = Encoder(
         base_block=EncoderBlock(
             building_blocks=EncoderBlockBuildingBlocks(
                 self_multi_head_attention_layer=deepcopy(
                     multi_head_attention_layer),
                 fully_connected_layer=feedforward_layer),
             feature_dimension=REPRESENTATION_DIMENSION,
             dropout_prob=DROPOUT_PROB),
         n_clones=N_ENCODER_BLOCKS)
     decoder = Decoder(
         base_block=DecoderBlock(
             building_blocks=DecoderBlockBuildingBlocks(
                 self_multi_head_attention_layer=deepcopy(
                     multi_head_attention_layer),
                 source_multi_head_attention_layer=deepcopy(
                     multi_head_attention_layer),
                 fully_connected_layer=feedforward_layer),
             feature_dimension=REPRESENTATION_DIMENSION,
             dropout_prob=DROPOUT_PROB),
         n_clones=N_DECODER_BLOCKS)
     log_softmax_layer = LogSoftmax(
         token_representation_dimension=REPRESENTATION_DIMENSION,
         vocabulary_dimension=TGT_VOCABULARY_DIMENSION)
     building_blocks = Seq2SeqBuildingBlocks(
         encoder=encoder,
         decoder=decoder,
         src_embedder=src_embedder,
         tgt_embedder=tgt_embedder,
         log_softmax_layer=log_softmax_layer)
     cls.layer = Seq2Seq(building_blocks=building_blocks)
     cls.forward_propagation_kwargs = {
         'src_tokens':
         torch_randint(low=0,
                       high=SRC_VOCABULARY_DIMENSION,
                       size=(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH),
                       dtype=torch_long),
         'tgt_tokens':
         torch_randint(low=0,
                       high=TGT_VOCABULARY_DIMENSION,
                       size=(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH - 1),
                       dtype=torch_long),
         'src_mask':
         torch_rand(size=(MINI_BATCH_SIZE, 1, MAX_SEQUENCE_LENGTH),
                    dtype=torch_float),
         'tgt_mask':
         torch_rand(size=(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH - 1,
                          MAX_SEQUENCE_LENGTH - 1),
                    dtype=torch_float)
     }
     cls.expected_output_shapes = [
         (MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH - 1,
          REPRESENTATION_DIMENSION)
     ]
     cls.expected_output_dtypes = [torch_float]
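
The src_mask and tgt_mask entries above are random floats, which is sufficient for shape and dtype checks; in actual usage they are typically binary masks, e.g. a padding mask over the source tokens and a causal ("subsequent-position") mask over the target. A minimal sketch under that assumption, with hypothetical names:

from torch import ones as torch_ones, tril as torch_tril

def build_causal_mask(sequence_length: int):
    # lower-triangular matrix: position i may only attend to positions <= i;
    # shaped (1, sequence_length, sequence_length) so that it broadcasts over
    # the mini-batch dimension:
    return torch_tril(
        torch_ones((sequence_length, sequence_length))).unsqueeze(0).bool()

# hypothetical usage with the constants of these tests:
# tgt_mask = build_causal_mask(MAX_SEQUENCE_LENGTH - 1)
# src_mask = (src_tokens != padding_token_id).unsqueeze(-2)
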
    def _build_model_architecture(self) -> None:
        """
        Initializing the Transformer model object instantiated with the
        architecture specified by the input hyperparameters, with newly
        initialized weights.
        """
        # building the architecture:

        # instantiating (some of) the base layers/blocks of the architecture:
        positional_encoding_layer = PositionalEncoding(
            token_representation_dimension=self.representation_dimension,
            dropout_prob=self.dropout_prob,
            max_sequence_length=self.max_sequence_length)
        multi_head_attention_layer = MultiHeadAttention(
            n_attention_heads=self.n_attention_heads,
            token_representation_dimension=self.representation_dimension,
            dropout_prob=self.dropout_prob)
        feedforward_layer = PositionWiseFeedForward(
            token_representation_dimension=self.representation_dimension,
            feedforward_dimension=self.feedforward_dimension,
            dropout_prob=self.dropout_prob)
        log_softmax_layer = LogSoftmax(
            token_representation_dimension=self.representation_dimension,
            vocabulary_dimension=self.tgt_vocabulary_dimension)

        # composing some of the base layers to build the more complex ones:
        src_embedder = Sequential(
            Embedder(
                token_representation_dimension=self.representation_dimension,
                vocabulary_dimension=self.src_vocabulary_dimension),
            deepcopy(positional_encoding_layer))
        tgt_embedder = Sequential(
            Embedder(
                token_representation_dimension=self.representation_dimension,
                vocabulary_dimension=self.tgt_vocabulary_dimension),
            deepcopy(positional_encoding_layer))
        base_encoder_block = EncoderBlock(
            building_blocks=EncoderBlockBuildingBlocks(
                self_multi_head_attention_layer=deepcopy(
                    multi_head_attention_layer),
                fully_connected_layer=deepcopy(feedforward_layer),
            ),
            feature_dimension=self.representation_dimension,
            dropout_prob=self.dropout_prob)
        encoder = Encoder(base_block=base_encoder_block,
                          n_clones=self.n_encoder_blocks)
        base_decoder_block = DecoderBlock(
            building_blocks=DecoderBlockBuildingBlocks(
                self_multi_head_attention_layer=deepcopy(
                    multi_head_attention_layer),
                source_multi_head_attention_layer=deepcopy(
                    multi_head_attention_layer),
                fully_connected_layer=deepcopy(feedforward_layer)),
            feature_dimension=self.representation_dimension,
            dropout_prob=self.dropout_prob)
        decoder = Decoder(base_block=base_decoder_block,
                          n_clones=self.n_decoder_blocks)

        # instantiating the whole seq2seq encoder-decoder model:
        building_blocks = Seq2SeqBuildingBlocks(
            encoder=encoder,
            decoder=decoder,
            src_embedder=src_embedder,
            tgt_embedder=tgt_embedder,
            log_softmax_layer=log_softmax_layer)
        model = Seq2Seq(building_blocks=building_blocks)

        # initializing the parameters:

        # for each parameter tensor of the model:
        for parameter in model.parameters():
            # Xavier (Glorot) initialization is defined in terms of the
            # fan-in and fan-out of a weight matrix, so only parameters with
            # more than one dimension (i.e. weight matrices) are
            # re-initialized, while 1-D parameters such as biases and
            # layer-normalization gains keep their default initialization:
            if parameter.dim() > 1:
                # weights initialized following Xavier uniform
                # initialization:
                xavier_uniform_(parameter)

        self.model = model
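
A small, self-contained illustration of the parameter.dim() > 1 filter used above: only weight matrices are re-initialized with Xavier uniform, while 1-D parameters such as biases keep their default values (the toy module and its layer sizes are arbitrary).

from torch import nn
from torch.nn.init import xavier_uniform_

# arbitrary toy module, just to show which parameters the filter selects:
toy_model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))

for name, parameter in toy_model.named_parameters():
    if parameter.dim() > 1:
        # 2-D weight matrices ('0.weight', '2.weight'): Xavier uniform needs
        # fan-in and fan-out, which are only defined for tensors with at
        # least two dimensions:
        xavier_uniform_(parameter)
    # 1-D tensors ('0.bias', '2.bias') are left at their default init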