Example #1
    def __init__(self, **kwargs):
        TrainableNM.__init__(self, **kwargs)

        params = self.local_parameters
        embedding_params = {
            "vocab_size": params["vocab_size"],
            "hidden_size": params["d_model"],
            "max_sequence_length": params["max_seq_length"],
            "embedding_dropout": params.get("embedding_dropout", 0),
            "learn_positional_encodings": params.get("learn_positional_encodings", False)
        }
        backbone_params = {
            "num_layers": params["num_layers"],
            "hidden_size": params["d_model"],
            "mask_future": params.get("mask_future", False),
            "num_attention_heads": params["num_attn_heads"],
            "inner_size": params["d_inner"],
            "ffn_dropout": params.get("ffn_dropout", 0),
            "hidden_act": params.get("hidden_act", "relu"),
            "attn_score_dropout": params.get("attn_score_dropout", 0),
            "attn_layer_dropout": params.get("attn_layer_dropout", 0)
        }

        self.embedding_layer = TransformerEmbedding(**embedding_params)
        self.encoder = TransformerEncoder(**backbone_params)

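        # Initialize all weights with std = 1 / sqrt(d_model), the usual
        # scaled-initialization heuristic for Transformer layers.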
        std_init_range = 1 / math.sqrt(params["d_model"])
        self.apply(
            lambda module: transformer_weights_init(module, std_init_range))
        self.to(self._device)
Example #2
    def __init__(self,
                 decoder,
                 log_softmax,
                 max_seq_length,
                 pad_token,
                 bos_token,
                 eos_token,
                 batch_size=1,
                 beam_size=4,
                 max_delta_length=50,
                 length_penalty=0,
                 **kwargs):
        TrainableNM.__init__(self, **kwargs)

        self.generator = BeamSearchSequenceGenerator(
            decoder.embedding_layer,
            decoder.decoder,
            log_softmax,
            max_sequence_length=max_seq_length,
            max_delta_length=max_delta_length,
            pad=pad_token,
            bos=bos_token,
            eos=eos_token,
            batch_size=batch_size,
            beam_size=beam_size,
            len_pen=length_penalty,
        )
Example #3
    def __init__(self,
                 *,
                 feat_in,
                 num_classes,
                 init_mode="xavier_uniform",
                 return_logits=True,
                 pooling_type='avg',
                 **kwargs):
        TrainableNM.__init__(self, **kwargs)

        self._feat_in = feat_in
        self._return_logits = return_logits
        self._num_classes = num_classes

        if pooling_type == 'avg':
            self.pooling = nn.AdaptiveAvgPool1d(1)
        elif pooling_type == 'max':
            self.pooling = nn.AdaptiveMaxPool1d(1)
        else:
            raise ValueError(
                'Pooling type chosen is not valid. Must be either `avg` or `max`'
            )

        self.decoder_layers = nn.Sequential(
            nn.Linear(self._feat_in, self._num_classes, bias=True))
        self.apply(lambda x: init_weights(x, mode=init_mode))
        self.to(self._device)
Example #4
    def __init__(self, feat_in, num_classes, emb_sizes=[1024, 1024], pool_mode='xvector', init_mode="xavier_uniform"):
        TrainableNM.__init__(self)
        self._feat_in = 0
        if pool_mode == 'gram':
            gram = True
            super_vector = False
        elif pool_mode == 'superVector':
            gram = True
            super_vector = True
        else:
            gram = False
            super_vector = False

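        # Pooled feature size: gram pooling yields feat_in ** 2 features, plain
        # stats pooling yields mean and std (2 * feat_in), and the super vector
        # concatenates both.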
        if gram:
            self._feat_in += feat_in ** 2
        else:
            self._feat_in += 2 * feat_in

        if super_vector and gram:
            self._feat_in += 2 * feat_in

        self._midEmbd1 = int(emb_sizes[0])  # Spkr Vector Embedding Shape
        self._midEmbd2 = int(emb_sizes[1]) if len(emb_sizes) > 1 else 0  # Spkr Vector Embedding Shape

        self._num_classes = num_classes
        self._pooling = StatsPoolLayer(gram=gram, super_vector=super_vector)

        self.mid1 = self.affineLayer(self._feat_in, self._midEmbd1, learn_mean=False)
        self.mid2 = self.affineLayer(self._midEmbd1, self._midEmbd2, learn_mean=False)
        self.final = nn.Linear(self._midEmbd2, self._num_classes)

        self.apply(lambda x: init_weights(x, mode=init_mode))
        self.to(self._device)
Example #5
    def __init__(self,
                 *,
                 pretrained_model_name=None,
                 config_filename=None,
                 vocab_size=None,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 max_position_embeddings=512,
                 random_init=False,
                 **kwargs):
        TrainableNM.__init__(self, **kwargs)

        # Check that only one of pretrained_model_name, config_filename, and
        # vocab_size was passed in
        total = 0

        if pretrained_model_name is not None:
            total += 1
        if config_filename is not None:
            total += 1
        if vocab_size is not None:
            total += 1

        if total != 1:
            raise ValueError(
                "Only one of pretrained_model_name, vocab_size, " +
                "or config_filename should be passed into the " +
                "BERT constructor.")

        if vocab_size is not None:
            config = BertConfig(
                vocab_size_or_config_json_file=vocab_size,
                hidden_size=hidden_size,
                num_hidden_layers=num_hidden_layers,
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                hidden_act=hidden_act,
                max_position_embeddings=max_position_embeddings)
            model = BertModel(config)
        elif pretrained_model_name is not None:
            model = BertModel.from_pretrained(pretrained_model_name)
        elif config_filename is not None:
            config = BertConfig.from_json_file(config_filename)
            model = BertModel(config)
        else:
            raise ValueError(
                "Either pretrained_model_name or vocab_size must" +
                "be passed into the BERT constructor")

        model.to(self._device)

        self.add_module("bert", model)
        self.config = model.config

        if random_init:
            self.apply(
                lambda module: transformer_weights_init(module, xavier=False))
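The three identifying arguments are mutually exclusive, so there are exactly three valid construction paths. A minimal sketch of the call patterns, assuming the module class is named BERT (hypothetical; the class name does not appear in the snippet):

    bert_hub = BERT(pretrained_model_name="bert-base-uncased")  # pretrained weights
    bert_cfg = BERT(config_filename="bert_config.json")  # architecture from a config file, random weights
    bert_new = BERT(vocab_size=30522)  # architecture from explicit hyperparameters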
Example #6
    def __init__(self, **kwargs):
        TrainableNM.__init__(self, **kwargs)

        self.hidden_size = self.local_parameters["d_model"]

        self.qa_outputs = nn.Linear(self.hidden_size, 2)
        self.qa_outputs.apply(transformer_weights_init)
        self.qa_outputs.to(self._device)
Example #7
    def __init__(self, *, vocab_size, d_model, **kwargs):
        TrainableNM.__init__(self, **kwargs)

        self.log_softmax = TransformerLogSoftmax(vocab_size=vocab_size,
                                                 hidden_size=d_model)

        self.log_softmax.apply(transformer_weights_init)
        self.log_softmax.to(self._device)
Example #8
    def __init__(self, *, d_model, num_classes, **kwargs):
        TrainableNM.__init__(self, **kwargs)

        self.log_softmax = ClassificationLogSoftmax(hidden_size=d_model,
                                                    num_classes=num_classes)

        self.log_softmax.apply(transformer_weights_init)
        self.log_softmax.to(self._device)
Example #9
    def __init__(self,
                 *,
                 pretrained_model_name=None,
                 config_filename=None,
                 vocab_size=None,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 max_position_embeddings=512,
                 **kwargs):
        TrainableNM.__init__(self, **kwargs)

        # Check that only one of pretrained_model_name, config_filename, and
        # vocab_size was passed in
        total = 0
        if pretrained_model_name is not None:
            total += 1
        if config_filename is not None:
            total += 1
        if vocab_size is not None:
            total += 1

        if total != 1:
            raise ValueError(
                "Only one of pretrained_model_name, vocab_size, " +
                "or config_filename should be passed into the " +
                "BERT constructor.")

        if vocab_size is not None:
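            # Both spellings of the vocabulary-size argument are passed,
            # presumably for compatibility across huggingface transformers
            # releases in which the parameter was renamed.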
            config = BertConfig(
                vocab_size_or_config_json_file=vocab_size,
                vocab_size=vocab_size,
                hidden_size=hidden_size,
                num_hidden_layers=num_hidden_layers,
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                hidden_act=hidden_act,
                max_position_embeddings=max_position_embeddings,
            )
            model = BertModel(config)
        elif pretrained_model_name is not None:
            model = BertModel.from_pretrained(pretrained_model_name)
        elif config_filename is not None:
            config = BertConfig.from_json_file(config_filename)
            model = BertModel(config)
        else:
            raise ValueError(
                "Either pretrained_model_name or vocab_size must" +
                " be passed into the BERT constructor")

        model.to(self._device)

        self.add_module("bert", model)
        self.config = model.config
        for key, value in self.config.to_dict().items():
            self._local_parameters[key] = value
Example #10
    def __init__(
        self,
        *,
        jasper,
        activation,
        feat_in,
        normalization_mode="batch",
        residual_mode="add",
        norm_groups=-1,
        conv_mask=True,
        frame_splicing=1,
        init_mode='xavier_uniform',
        **kwargs
    ):
        TrainableNM.__init__(self, **kwargs)

        activation = jasper_activations[activation]()
        feat_in = feat_in * frame_splicing

        residual_panes = []
        encoder_layers = []
        self.dense_residual = False
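        # For blocks flagged 'residual_dense', record their input widths in a
        # shared list so every later block receives residual connections from
        # all earlier dense-residual blocks.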
        for lcfg in jasper:
            dense_res = []
            if lcfg.get('residual_dense', False):
                residual_panes.append(feat_in)
                dense_res = residual_panes
                self.dense_residual = True
            groups = lcfg.get('groups', 1)
            separable = lcfg.get('separable', False)
            heads = lcfg.get('heads', -1)
            encoder_layers.append(
                JasperBlock(
                    feat_in,
                    lcfg['filters'],
                    repeat=lcfg['repeat'],
                    kernel_size=lcfg['kernel'],
                    stride=lcfg['stride'],
                    dilation=lcfg['dilation'],
                    dropout=lcfg['dropout'],
                    residual=lcfg['residual'],
                    groups=groups,
                    separable=separable,
                    heads=heads,
                    residual_mode=residual_mode,
                    normalization=normalization_mode,
                    norm_groups=norm_groups,
                    activation=activation,
                    residual_panes=dense_res,
                    conv_mask=conv_mask,
                )
            )
            feat_in = lcfg['filters']

        self.encoder = nn.Sequential(*encoder_layers)
        self.apply(lambda x: init_weights(x, mode=init_mode))
        self.to(self._device)
Example #11
    def __init__(self, *, voc_size, hidden_size, dropout=0.0, **kwargs):
        TrainableNM.__init__(self, **kwargs)

        self.voc_size = voc_size
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.embedding = nn.Embedding(self.voc_size, self.hidden_size)
        if self.dropout != 0.0:
            self.embedding_dropout = nn.Dropout(self.dropout)
Example #12
    def __init__(self, *, from_dim, to_dim, dropout=0.0, **kwargs):
        TrainableNM.__init__(self, **kwargs)

        self.from_dim = from_dim
        self.to_dim = to_dim
        self.dropout = dropout
        self.projection = nn.Linear(self.from_dim, self.to_dim, bias=False)
        if self.dropout != 0.0:
            self.embedding_dropout = nn.Dropout(self.dropout)
Example #13
    def __init__(self, hidden_size, num_classes, dropout, **kwargs):
        TrainableNM.__init__(self, **kwargs)
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.dropout = nn.Dropout(dropout)
        self.dense = nn.Linear(self.hidden_size, self.hidden_size)
        self.classifier = nn.Linear(self.hidden_size, self.num_classes)
        self.apply(
            lambda module: transformer_weights_init(module, xavier=False))
        self.to(self._device)
Example #14
    def __init__(self, *, feat_in, num_classes, init_mode="xavier_uniform", **kwargs):
        TrainableNM.__init__(self, **kwargs)

        self._feat_in = feat_in
        # Add 1 for blank char
        self._num_classes = num_classes + 1

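        # A pointwise (kernel_size=1) Conv1d acts as a per-timestep linear
        # classifier over the encoder features.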
        self.decoder_layers = nn.Sequential(nn.Conv1d(self._feat_in, self._num_classes, kernel_size=1, bias=True))
        self.apply(lambda x: init_weights(x, mode=init_mode))
        self.to(self._device)
Example #15
    def __init__(self, **kwargs):
        TrainableNM.__init__(self, **kwargs)

        self.hidden_size = self.local_parameters["d_model"]
        self.num_labels = self.local_parameters["num_labels"]
        self.dropout = nn.Dropout(self.local_parameters["dropout"])
        self.classifier = nn.Linear(self.hidden_size, self.num_labels)

        self.apply(
            lambda module: transformer_weights_init(module, xavier=False))
        self.to(self._device)
Example #16
    def __init__(self, decoder, log_softmax, **kwargs):
        TrainableNM.__init__(self, **kwargs)

        generator_params = {
            "max_sequence_length": self.local_parameters["max_seq_length"],
            "pad": self.local_parameters["pad_token"],
            "bos": self.local_parameters["bos_token"],
            "eos": self.local_parameters["eos_token"],
            "batch_size": self.local_parameters.get("batch_size", 1)
        }
        self.generator = GreedySequenceGenerator(decoder, log_softmax,
                                                 **generator_params)
Example #17
    def __init__(self, *, dim, **kwargs):
        # Part specific to the Neural Modules API:
        #   (1) call base constructor
        #   (2) define input and output ports
        TrainableNM.__init__(self, **kwargs)

        # End of the Neural Modules-specific part. The rest is plain PyTorch code
        self._dim = dim
        self.fc1 = nn.Linear(self._dim, 1)
        t.nn.init.xavier_uniform_(self.fc1.weight)
        self._device = t.device("cuda" if self.placement ==
                                DeviceType.GPU else "cpu")
        self.to(self._device)
Example #18
    def __init__(
            self, *,
            sample_rate=16000,
            window_size=0.02,
            window_stride=0.01,
            window="hann",
            normalize="per_feature",
            n_fft=None,
            preemph=0.97,
            features=64,
            lowfreq=0,
            highfreq=None,
            feat_type="logfbank",
            dither=1e-5,
            pad_to=16,
            frame_splicing=1,
            stft_conv=False,
            **kwargs
    ):
        if "fbank" not in feat_type:
            raise NotImplementedError("AudioPreprocessing currently only "
                                      "accepts 'fbank' or 'logfbank' as "
                                      "feat_type")
        TrainableNM.__init__(self, **kwargs)

        self.featurizer = FilterbankFeatures(
            sample_rate=sample_rate,
            window_size=window_size,
            window_stride=window_stride,
            window=window,
            normalize=normalize,
            n_fft=n_fft,
            preemph=preemph,
            nfilt=features,
            lowfreq=lowfreq,
            highfreq=highfreq,
            dither=dither,
            pad_to=pad_to,
            frame_splicing=frame_splicing,
            stft_conv=stft_conv,
            logger=self._logger
        )
        # _pre_procesing_config = self.local_parameters
        # self.featurizer = FeatureFactory.from_config(_pre_procesing_config)
        self.featurizer.to(self._device)

        self.disable_casts = (self._opt_level == Optimization.mxprO1)
Example #19
    def __init__(self, decoder, log_softmax, **kwargs):
        TrainableNM.__init__(self, **kwargs)

        params = self.local_parameters
        generator_params = {
            "max_sequence_length": params["max_seq_length"],
            "max_delta_length": params.get("max_delta_length", 50),
            "pad": params["pad_token"],
            "bos": params["bos_token"],
            "eos": params["eos_token"],
            "batch_size": params.get("batch_size", 1),
            "beam_size": params.get("beam_size", 4),
            "len_pen": params.get("length_penalty", 0)
        }
        self.generator = BeamSearchSequenceGenerator(decoder.embedding_layer,
                                                     decoder.decoder,
                                                     log_softmax,
                                                     **generator_params)
Example #20
    def __init__(self,
                 decoder,
                 log_softmax,
                 max_seq_length,
                 pad_token,
                 bos_token,
                 eos_token,
                 batch_size=1,
                 **kwargs):
        TrainableNM.__init__(self, **kwargs)

        self.generator = GreedySequenceGenerator(
            decoder.embedding_layer,
            decoder.decoder,
            log_softmax,
            max_sequence_length=max_seq_length,
            pad=pad_token,
            bos=bos_token,
            eos=eos_token,
            batch_size=batch_size)
Example #21
    def __init__(self,
                 vocab_size,
                 d_model,
                 d_inner,
                 max_seq_length,
                 num_layers,
                 num_attn_heads,
                 ffn_dropout=0.0,
                 embedding_dropout=0.0,
                 attn_score_dropout=0.0,
                 attn_layer_dropout=0.0,
                 learn_positional_encodings=False,
                 hidden_act='relu',
                 mask_future=False,
                 **kwargs):
        TrainableNM.__init__(self, **kwargs)

        self.embedding_layer = TransformerEmbedding(
            vocab_size=vocab_size,
            hidden_size=d_model,
            max_sequence_length=max_seq_length,
            embedding_dropout=embedding_dropout,
            learn_positional_encodings=learn_positional_encodings,
        )
        self.encoder = TransformerEncoder(
            num_layers=num_layers,
            hidden_size=d_model,
            mask_future=mask_future,
            num_attention_heads=num_attn_heads,
            inner_size=d_inner,
            ffn_dropout=ffn_dropout,
            hidden_act=hidden_act,
            attn_score_dropout=attn_score_dropout,
            attn_layer_dropout=attn_layer_dropout,
        )

        std_init_range = 1 / math.sqrt(d_model)
        self.apply(
            lambda module: transformer_weights_init(module, std_init_range))
        self.to(self._device)
Example #22
    def __init__(self, **kwargs):
        TrainableNM.__init__(self, **kwargs)
Example #23
    def __init__(self, **kwargs):
        TrainableNM.__init__(self, **kwargs)
        self._loss_fn = SequenceClassificationLoss()
 def __init__(self, mode="add", **kwargs):
     TrainableNM.__init__(self, **kwargs)
     self._mode = mode
Example #25
    def __init__(self, **kwargs):
        TrainableNM.__init__(self, **kwargs)
        label_smoothing = self.local_parameters.get("label_smoothing", 0.0)
        self._loss_fn = SmoothedCrossEntropyLoss(label_smoothing)
Example #26
    def __init__(
        self,
        input_size: int,
        output_size: int,
        hidden_sizes: List[int] = [],
        dimensions: int = 2,
        dropout_rate: float = 0,
        name: Optional[str] = None,
    ):
        """
        Initializes the feed-forward network.

        Args:
            input_size: Size of input (1D)
            output_size: Size of the output (1D)
            hidden_sizes: Sizes of the consecutive hidden layers (DEFAULT: [] = no hidden)
            dimensions: Number of dimensions of input/output tensors (DEFAULT: 2 = BATCH X INPUT_SIZE)
            dropout_rate: Dropout rate (DEFAULT: 0)
            name: Name of the module (DEFAULT: None)
        """
        # Call constructor of the parent class.
        TrainableNM.__init__(self, name=name)

        # Get input size.
        self._input_size = input_size
        if type(self._input_size) == list:
            if len(self._input_size) == 1:
                self._input_size = self._input_size[0]
            else:
                raise ConfigurationError(
                    "'input_size' must be a single value (received {})".format(
                        self._input_size))

        # Get input/output dimensions, i.e. number of axes of the input [BATCH_SIZE x ... x INPUT_SIZE].
        # The module will "broadcast" over those dimensions.
        self._dimensions = dimensions
        if self._dimensions < 2:
            raise ConfigurationError(
                "'dimensions' must be bigger than two  (received {})".format(
                    self._dimensions))

        # Get output (prediction/logits) size.
        self._output_size = output_size
        if type(self._output_size) == list:
            if len(self._output_size) == 1:
                self._output_size = self._output_size[0]
            else:
                raise ConfigurationError(
                    "'output_size' must be a single value (received {})".
                    format(self._output_size))

        logging.info(
            "Initializing network with input size = {} and output size = {}".
            format(self._input_size, self._output_size))

        # Create the module list.
        modules = []

        # Retrieve number of hidden layers, along with their sizes (numbers of hidden neurons from configuration).
        if type(hidden_sizes) == list:
            # Stack linear layers.
            input_dim = self._input_size
            for hidden_dim in hidden_sizes:
                # Add linear layer.
                modules.append(torch.nn.Linear(input_dim, hidden_dim))
                # Add activation.
                modules.append(torch.nn.ReLU())
                # Add dropout.
                if dropout_rate > 0:
                    modules.append(torch.nn.Dropout(dropout_rate))
                # Remember size.
                input_dim = hidden_dim

            # Add the last output" (or in a special case: the only) layer.
            modules.append(torch.nn.Linear(input_dim, self._output_size))

            logging.info("Created {} hidden layers with sizes {}".format(
                len(hidden_sizes), hidden_sizes))

        else:
            raise ConfigurationError(
                "'hidden_sizes' must contain a list with numbers of neurons in consecutive hidden layers (received {})"
                .format(hidden_sizes))

        # Finally create the sequential model out of those modules.
        self.layers = torch.nn.Sequential(*modules)
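A minimal usage sketch of the module above, assuming the class is named FeedForwardNetwork (hypothetical; the class name does not appear in the snippet):

    ffn = FeedForwardNetwork(input_size=128, output_size=10, hidden_sizes=[64, 32], dropout_rate=0.1)
    # ffn.layers: Linear(128, 64) -> ReLU -> Dropout -> Linear(64, 32) -> ReLU -> Dropout -> Linear(32, 10)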
Example #27
    def __init__(
        self,
        input_depth: int,
        input_height: int,
        input_width: int,
        conv1_out_channels: int = 64,
        conv1_kernel_size: int = 3,
        conv1_stride: int = 1,
        conv1_padding: int = 0,
        maxpool1_kernel_size: int = 2,
        conv2_out_channels: int = 32,
        conv2_kernel_size: int = 3,
        conv2_stride: int = 1,
        conv2_padding: int = 0,
        maxpool2_kernel_size: int = 2,
        conv3_out_channels: int = 16,
        conv3_kernel_size: int = 3,
        conv3_stride: int = 1,
        conv3_padding: int = 0,
        maxpool3_kernel_size: int = 2,
        name: Optional[str] = None,
    ):
        """
        Constructor of a simple CNN.

        The overall structure of this CNN is as follows:

            (Conv1 -> MaxPool1 -> ReLu) -> (Conv2 -> MaxPool2 -> ReLu) -> (Conv3 -> MaxPool3 -> ReLu)

        The parameters that the user can change are:

         - For Conv1, Conv2 & Conv3: number of output channels, kernel size, stride and padding.
         - For MaxPool1, MaxPool2 & MaxPool3: Kernel size


        .. note::

            We are using the default values of ``dilation``, ``groups`` & ``bias`` for ``nn.Conv2D``.

            Similarly for the ``stride``, ``padding``, ``dilation``, ``return_indices`` & ``ceil_mode`` of \
            ``nn.MaxPool2D``.

        Args: 
            input_depth: Depth of the input image
            input_height: Height of the input image
            input_width: Width of the input image
            convX_out_channels: Number of output channels of layer X (X=1,2,3)
            convX_kernel_size: Kernel size of layer X (X=1,2,3)
            convX_stride: Stride of layer X (X=1,2,3)
            convX_padding: Padding of layer X (X=1,2,3)
            name: Name of the module (DEFAULT: None)
        """
        # Call base constructor.
        TrainableNM.__init__(self, name=name)

        # Get input image information from the global parameters.
        self._input_depth = input_depth
        self._input_height = input_height
        self._input_width = input_width

        # Retrieve the Conv1 parameters.
        self._conv1_out_channels = conv1_out_channels
        self._conv1_kernel_size = conv1_kernel_size
        self._conv1_stride = conv1_stride
        self._conv1_padding = conv1_padding

        # Retrieve the MaxPool1 parameter.
        self._maxpool1_kernel_size = maxpool1_kernel_size

        # Retrieve the Conv2 parameters.
        self._conv2_out_channels = conv2_out_channels
        self._conv2_kernel_size = conv2_kernel_size
        self._conv2_stride = conv2_stride
        self._conv2_padding = conv2_padding

        # Retrieve the MaxPool2 parameter.
        self._maxpool2_kernel_size = maxpool2_kernel_size

        # Retrieve the Conv3 parameters.
        self._conv3_out_channels = conv3_out_channels
        self._conv3_kernel_size = conv3_kernel_size
        self._conv3_stride = conv3_stride
        self._conv3_padding = conv3_padding

        # Retrieve the MaxPool3 parameter.
        self._maxpool3_kernel_size = maxpool3_kernel_size

        # We can compute the spatial size of the output volume as a function of the input volume size (W),
        # the receptive field size of the Conv Layer neurons (F), the stride with which they are applied (S),
        # and the amount of zero padding used (P) on the border.
        # The corresponding equation is conv_size = ((W−F+2P)/S)+1.
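        # Worked example with illustrative numbers (not the module defaults):
        # W=28, F=3, P=0, S=1 gives conv_size = ((28 - 3 + 2*0) / 1) + 1 = 26.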

        # doc for nn.Conv2D: https://pytorch.org/docs/stable/nn.html#torch.nn.Conv2d
        # doc for nn.MaxPool2D: https://pytorch.org/docs/stable/nn.html#torch.nn.MaxPool2d

        # ----------------------------------------------------
        # Conv1
        self._conv1 = nn.Conv2d(
            in_channels=self._input_depth,
            out_channels=self._conv1_out_channels,
            kernel_size=self._conv1_kernel_size,
            stride=self._conv1_stride,
            padding=self._conv1_padding,
            dilation=1,
            groups=1,
            bias=True,
        )

        width_features_conv1 = np.floor((
            (self._input_width - self._conv1_kernel_size +
             2 * self._conv1_padding) / self._conv1_stride) + 1)
        height_features_conv1 = np.floor((
            (self._input_height - self._conv1_kernel_size +
             2 * self._conv1_padding) / self._conv1_stride) + 1)

        # ----------------------------------------------------
        # MaxPool1
        self._maxpool1 = nn.MaxPool2d(kernel_size=self._maxpool1_kernel_size)

        width_features_maxpool1 = np.floor((
            (width_features_conv1 - self._maxpool1_kernel_size +
             2 * self._maxpool1.padding) / self._maxpool1.stride) + 1)

        height_features_maxpool1 = np.floor((
            (height_features_conv1 - self._maxpool1_kernel_size +
             2 * self._maxpool1.padding) / self._maxpool1.stride) + 1)

        # ----------------------------------------------------
        # Conv2
        self._conv2 = nn.Conv2d(
            in_channels=self._conv1_out_channels,
            out_channels=self._conv2_out_channels,
            kernel_size=self._conv2_kernel_size,
            stride=self._conv2_stride,
            padding=self._conv2_padding,
            dilation=1,
            groups=1,
            bias=True,
        )

        width_features_conv2 = np.floor((
            (width_features_maxpool1 - self._conv2_kernel_size +
             2 * self._conv2_padding) / self._conv2_stride) + 1)
        height_features_conv2 = np.floor((
            (height_features_maxpool1 - self._conv2_kernel_size +
             2 * self._conv2_padding) / self._conv2_stride) + 1)

        # ----------------------------------------------------
        # MaxPool2
        self._maxpool2 = nn.MaxPool2d(kernel_size=self._maxpool2_kernel_size)

        width_features_maxpool2 = np.floor((
            (width_features_conv2 - self._maxpool2_kernel_size +
             2 * self._maxpool2.padding) / self._maxpool2.stride) + 1)
        height_features_maxpool2 = np.floor((
            (height_features_conv2 - self._maxpool2_kernel_size +
             2 * self._maxpool2.padding) / self._maxpool2.stride) + 1)

        # ----------------------------------------------------
        # Conv3
        self._conv3 = nn.Conv2d(
            in_channels=self._conv2_out_channels,
            out_channels=self._conv3_out_channels,
            kernel_size=self._conv3_kernel_size,
            stride=self._conv3_stride,
            padding=self._conv3_padding,
            dilation=1,
            groups=1,
            bias=True,
        )

        width_features_conv3 = np.floor((
            (width_features_maxpool2 - self._conv3_kernel_size +
             2 * self._conv3_padding) / self._conv3_stride) + 1)
        height_features_conv3 = np.floor((
            (height_features_maxpool2 - self._conv3_kernel_size +
             2 * self._conv3_padding) / self._conv3_stride) + 1)

        # ----------------------------------------------------
        # MaxPool3
        self._maxpool3 = nn.MaxPool2d(kernel_size=self._maxpool3_kernel_size)

        width_features_maxpool3 = np.floor((
            (width_features_conv3 - self._maxpool3_kernel_size +
             2 * self._maxpool3.padding) / self._maxpool3.stride) + 1)

        height_features_maxpool3 = np.floor((
            (height_features_conv3 - self._maxpool3_kernel_size +
             2 * self._maxpool3.padding) / self._maxpool3.stride) + 1)

        # Remember the output dims.
        self._feature_map_height = height_features_maxpool3
        self._feature_map_width = width_features_maxpool3
        self._feature_map_depth = self._conv3_out_channels

        # Log info about dimensions.
        logging.info('Input shape: [-1, {}, {}, {}]'.format(
            self._input_depth, self._input_height, self._input_width))
        logging.debug('Computed output shape of each layer:')
        logging.debug('  * Conv1: [-1, {}, {}, {}]'.format(
            self._conv1_out_channels, height_features_conv1,
            width_features_conv1))
        logging.debug('  * MaxPool1: [-1, {}, {}, {}]'.format(
            self._conv1_out_channels, height_features_maxpool1,
            width_features_maxpool1))
        logging.debug('  * Conv2: [-1, {}, {}, {}]'.format(
            self._conv2_out_channels, height_features_conv2,
            width_features_conv2))
        logging.debug('  * MaxPool2: [-1, {}, {}, {}]'.format(
            self._conv2_out_channels, height_features_maxpool2,
            width_features_maxpool2))
        logging.debug('  * Conv3: [-1, {}, {}, {}]'.format(
            self._conv3_out_channels,
            height_features_conv3,
            width_features_conv3,
        ))
        logging.debug('  * MaxPool3: [-1, {}, {}, {}]'.format(
            self._conv3_out_channels, height_features_maxpool3,
            width_features_maxpool3))
        logging.info('Output shape: [-1, {}, {}, {}]'.format(
            self._feature_map_depth, self._feature_map_height,
            self._feature_map_width))
Example #28
    def __init__(
        self,
        model_type: str,
        output_size: Optional[int] = None,
        return_feature_maps: bool = False,
        pretrained: bool = False,
        name: Optional[str] = None,
    ):
        """
        Initializes the ``ImageEncoder`` model, creates the required "backbone".

        Args:
            model_type: Type of backbone (Options: VGG16 | DenseNet121 | ResNet152 | ResNet50)
            output_size: Size of the output layer (Optional, Default: None)
            return_feature_maps: Return mode: image embeddings vs feature maps (Default: False)
            pretrained: Loads pretrained model (Default: False)
            name: Name of the module (DEFAULT: None)
        """
        TrainableNM.__init__(self, name=name)

        # Get operation modes.
        self._return_feature_maps = return_feature_maps

        # Get model type.
        self._model_type = get_value_from_dictionary(
            model_type, "vgg16 | densenet121 | resnet152 | resnet50".split(" | ")
        )

        # Get output size (optional - not in feature_maps).
        self._output_size = output_size

        if self._model_type == 'vgg16':
            # Get VGG16
            self._model = models.vgg16(pretrained=pretrained)

            if self._return_feature_maps:
                # Use only the "feature encoder".
                self._model = self._model.features

                # Remember the output feature map dims.
                self._feature_map_height = 7
                self._feature_map_width = 7
                self._feature_map_depth = 512

            else:
                # Use the whole model, but "reshape"/reinstantiate the last layer ("FC6").
                self._model.classifier._modules['6'] = torch.nn.Linear(4096, self._output_size)

        elif self._model_type == 'densenet121':
            # Get densenet121
            self._model = models.densenet121(pretrained=pretrained)

            if self._return_feature_maps:
                raise ConfigurationError("'densenet121' doesn't support 'return_feature_maps' mode (yet)")

            # Use the whole model, but "reshape"/reinstantiate the last layer ("FC6").
            self._model.classifier = torch.nn.Linear(1024, self._output_size)

        elif self._model_type == 'resnet152':
            # Get resnet152
            self._model = models.resnet152(pretrained=pretrained)

            if self._return_feature_maps:
                # Get all modules excluding the last two (avgpool and fc)
                modules = list(self._model.children())[:-2]
                self._model = torch.nn.Sequential(*modules)

                # Remember the output feature map dims.
                self._feature_map_height = 7
                self._feature_map_width = 7
                self._feature_map_depth = 2048

            else:
                # Use the whole model, but "reshape"/reinstantiate the last layer ("FC6").
                self._model.fc = torch.nn.Linear(2048, self._output_size)

        elif self._model_type == 'resnet50':
            # Get resnet50
            self._model = models.resnet50(pretrained=pretrained)

            if self._return_feature_maps:
                # Get all modules excluding the last two (avgpool and fc)
                modules = list(self._model.children())[:-2]
                self._model = torch.nn.Sequential(*modules)

                # Remember the output feature map dims.
                self._feature_map_height = 7
                self._feature_map_width = 7
                self._feature_map_depth = 2048

            else:
                # Use the whole model, but "reshape"/reinstantiate the last layer ("FC6").
                self._model.fc = torch.nn.Linear(2048, self._output_size)
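A minimal instantiation sketch, assuming the class is named ImageEncoder (hypothetical). In feature-map mode the classifier head is dropped and, per the dimensions recorded above, the ResNet backbones produce a [batch, 2048, 7, 7] map; otherwise the final fully connected layer is resized to output_size:

    encoder = ImageEncoder(model_type="resnet50", return_feature_maps=True, pretrained=True)
    classifier = ImageEncoder(model_type="vgg16", output_size=10)  # last FC layer resized to 10 classes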
Example #29
    def __str__(self):
        name = TrainableNM.__str__(self)

        if self.name:
            name = self.name + name
        return name