Example #1
    def __init__(
        self,
        num_hidden_layers: int,
        hidden_size: int,
        num_attention_heads: int,
        intermediate_size: int,
        attention_probs_dropout_prob: float,
        hidden_dropout_prob: float,
        hidden_act: str = "relu",
    ) -> None:
        super().__init__()
        # stack of num_hidden_layers identical encoder layers
        self.layers = nn.ModuleList([
            OFEncoderLayer(
                hidden_size,
                num_attention_heads,
                intermediate_size,
                attention_probs_dropout_prob,
                hidden_dropout_prob,
                hidden_act,
            ) for _ in range(num_hidden_layers)
        ])
Example #2
    def __init__(
        self,
        d_model=256,
        n_heads=4,
        d_ff=2048,
        n_blocks=6,
        pos_dropout=0.0,
        slf_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        normalize_before=False,
        concat_after=False,
        relative_positional=False,
        activation="relu",
    ):
        super(TransformerEncoder, self).__init__()

        self.normalize_before = normalize_before
        self.relative_positional = relative_positional

        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

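        # stack of n_blocks identical Transformer encoder layers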
        self.blocks = nn.ModuleList([
            TransformerEncoderLayer(
                n_heads,
                d_model,
                d_ff,
                slf_attn_dropout,
                ffn_dropout,
                residual_dropout=residual_dropout,
                normalize_before=normalize_before,
                concat_after=concat_after,
                relative_positional=relative_positional,
                activation=activation,
            ) for _ in range(n_blocks)
        ])

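        # a pre-norm encoder applies one final LayerNorm to its output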
        if self.normalize_before:
            self.norm = nn.LayerNorm(d_model)
Example #3
    def __init__(self, params):
        super(TransformerLanguageModel, self).__init__(params)

        self.model_type = "transformer_lm"
        self.normalize_before = False
        self.smoothing = params["smoothing"]
        self.vocab_size = params["vocab_size"]
        self.num_blocks = params["num_blocks"]

        self.embedding = nn.Embedding(self.vocab_size, params["d_model"])
        self.pos_embedding = PositionalEncoding(params["d_model"], 0.0)

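        # num_blocks Transformer encoder layers form the language-model body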
        self.blocks = nn.ModuleList([
            TransformerEncoderLayer(
                params["n_heads"],
                params["d_model"],
                params["d_ff"],
                slf_attn_dropout=0.0,
                ffn_dropout=0.0,
                residual_dropout=params["residual_dropout"],
                normalize_before=False,
                concat_after=False,
                activation="glu",
            ) for _ in range(self.num_blocks)
        ])

        if self.normalize_before:
            self.after_norm = nn.LayerNorm(params["d_model"])

        self.output_project = nn.Linear(params["d_model"], self.vocab_size)

        if params["share_embedding"]:
            self.output_project.weight = self.embedding.weight
            print("Share the weight of embedding to the output project layer!")

        self.crit = LabelSmoothingLoss(size=self.vocab_size,
                                       smoothing=self.smoothing,
                                       padding_idx=PAD)
Example #4
    def __init__(
        self,
        num_patches,
        emb_dim,
        mlp_dim,
        num_layers=12,
        num_heads=12,
        dropout_rate=0.1,
        attn_dropout_rate=0.0,
    ):
        super(Encoder, self).__init__()

        # positional embedding
        self.pos_embedding = PositionEmbs(num_patches, emb_dim, dropout_rate)

        # encoder blocks
        in_dim = emb_dim
        self.encoder_layers = nn.ModuleList()
        for i in range(num_layers):
            layer = EncoderBlock(in_dim, mlp_dim, num_heads, dropout_rate,
                                 attn_dropout_rate)
            self.encoder_layers.append(layer)
        self.norm = LayerNorm(in_dim)
Example #5
    def __init__(
        self,
        d_input,
        n_layers,
        n_head,
        d_k,
        d_v,
        d_model,
        d_inner,
        dropout=0.1,
        pe_maxlen=5000,
    ):
        super(Encoder, self).__init__()
        # parameters
        self.d_input = d_input
        self.n_layers = n_layers
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model
        self.d_inner = d_inner
        self.dropout_rate = dropout
        self.pe_maxlen = pe_maxlen

        # use linear transformation with layer norm to replace input embedding
        self.linear_in = nn.Linear(d_input, d_model)
        self.layer_norm_in = nn.LayerNorm(d_model)
        self.positional_encoding = PositionalEncoding(d_model,
                                                      max_len=pe_maxlen)
        self.dropout = nn.Dropout(dropout)

        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)
        ])
Example #6
    def __init__(
        self,
        num_layers,
        max_position_embeddings,
        hidden_size,
        intermediate_size,
        nheads,
        activation,
        chunk_size_feed_forward=0,
        layer_norm_eps=1e-5,
        attn_dropout=0,
        hidden_dropout=0,
        position_embedding_type="absolute",
        is_decoder=False,
        add_cross_attention=False,
    ):
        super(BertEncoder, self).__init__()
        self.add_cross_attention = add_cross_attention
        self.num_layers = num_layers

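        # num_layers identical BertLayer blocks, all built from the same configuration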
        self.layer = nn.ModuleList([
            BertLayer(
                max_position_embeddings,
                hidden_size,
                intermediate_size,
                nheads,
                activation,
                chunk_size_feed_forward,
                layer_norm_eps,
                attn_dropout,
                hidden_dropout,
                position_embedding_type,
                is_decoder,
                add_cross_attention,
            ) for _ in range(num_layers)
        ])
Example #7
    def __init__(
        self, model, input_size, output_size, num_experts, noisy_gating=True, k=4
    ):
        super(MoE, self).__init__()
        self.noisy_gating = noisy_gating
        self.num_experts = num_experts
        self.output_size = output_size
        self.input_size = input_size
        self.k = k

        # instantiate experts
        self.experts = nn.ModuleList([model for i in range(self.num_experts)])
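        # note: the same `model` instance is reused for every expert, so all
        # experts share one set of parameters; use copy.deepcopy(model) here
        # if independent experts are intended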

        self.w_gate = nn.Parameter(
            flow.zeros(input_size, num_experts), requires_grad=True
        )
        self.w_noise = nn.Parameter(
            flow.zeros(input_size, num_experts), requires_grad=True
        )

        self.softplus = nn.Softplus()
        self.softmax = nn.Softmax(1)

        assert self.k <= self.num_experts
Example #8
    def __init__(self, options):
        super(SincNet, self).__init__()

        self.cnn_N_filt = options["cnn_N_filt"]
        self.cnn_len_filt = options["cnn_len_filt"]
        self.cnn_max_pool_len = options["cnn_max_pool_len"]

        self.cnn_act = options["cnn_act"]
        self.cnn_drop = options["cnn_drop"]

        self.cnn_use_laynorm = options["cnn_use_laynorm"]
        self.cnn_use_batchnorm = options["cnn_use_batchnorm"]
        self.cnn_use_laynorm_inp = options["cnn_use_laynorm_inp"]
        self.cnn_use_batchnorm_inp = options["cnn_use_batchnorm_inp"]

        self.input_dim = int(options["input_dim"])

        self.fs = options["fs"]

        self.N_cnn_lay = len(options["cnn_N_filt"])
        self.conv = nn.ModuleList([])
        self.bn = nn.ModuleList([])
        self.ln = nn.ModuleList([])
        self.act = nn.ModuleList([])
        self.drop = nn.ModuleList([])

        if self.cnn_use_laynorm_inp:
            self.ln0 = LayerNorm(self.input_dim)

        if self.cnn_use_batchnorm_inp:
            self.bn0 = nn.BatchNorm1d([self.input_dim], momentum=0.05)

        current_input = self.input_dim

        for i in range(self.N_cnn_lay):

            N_filt = int(self.cnn_N_filt[i])
            len_filt = int(self.cnn_len_filt[i])

            # dropout
            self.drop.append(nn.Dropout(p=self.cnn_drop[i]))

            # activation
            self.act.append(act_fun(self.cnn_act[i]))

            # layer norm initialization
            self.ln.append(
                LayerNorm((
                    N_filt,
                    int((current_input - self.cnn_len_filt[i] + 1) /
                        self.cnn_max_pool_len[i]),
                )))

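            # batch norm initialization (note: BatchNorm1d's second positional
            # argument is eps, so the pooled length computed here sets eps
            # rather than an output shape)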
            self.bn.append(
                nn.BatchNorm1d(
                    N_filt,
                    int((current_input - self.cnn_len_filt[i] + 1) /
                        self.cnn_max_pool_len[i]),
                    momentum=0.05,
                ))

            if i == 0:
                self.conv.append(
                    SincConv_fast(self.cnn_N_filt[0], self.cnn_len_filt[0],
                                  self.fs))
            else:
                self.conv.append(
                    nn.Conv1d(self.cnn_N_filt[i - 1], self.cnn_N_filt[i],
                              self.cnn_len_filt[i]))

            current_input = int((current_input - self.cnn_len_filt[i] + 1) /
                                self.cnn_max_pool_len[i])

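        # flattened output size: remaining time steps times the number of
        # filters in the last convolutional layer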
        self.out_dim = current_input * N_filt
Example #9
    def __init__(self, options):
        super(MLP, self).__init__()

        self.input_dim = int(options["input_dim"])
        self.fc_lay = options["fc_lay"]
        self.fc_drop = options["fc_drop"]
        self.fc_use_batchnorm = options["fc_use_batchnorm"]
        self.fc_use_laynorm = options["fc_use_laynorm"]
        self.fc_use_laynorm_inp = options["fc_use_laynorm_inp"]
        self.fc_use_batchnorm_inp = options["fc_use_batchnorm_inp"]
        self.fc_act = options["fc_act"]

        self.wx = nn.ModuleList([])
        self.bn = nn.ModuleList([])
        self.ln = nn.ModuleList([])
        self.act = nn.ModuleList([])
        self.drop = nn.ModuleList([])

        # input layer normalization
        if self.fc_use_laynorm_inp:
            self.ln0 = LayerNorm(self.input_dim)

        # input batch normalization
        if self.fc_use_batchnorm_inp:
            self.bn0 = nn.BatchNorm1d([self.input_dim], momentum=0.05)

        self.N_fc_lay = len(self.fc_lay)

        current_input = self.input_dim

        # Initialization of hidden layers
        for i in range(self.N_fc_lay):

            # dropout
            self.drop.append(nn.Dropout(p=self.fc_drop[i]))

            # activation
            self.act.append(act_fun(self.fc_act[i]))

            add_bias = True

            # layer norm initialization
            self.ln.append(LayerNorm(self.fc_lay[i]))
            self.bn.append(nn.BatchNorm1d(self.fc_lay[i], momentum=0.05))

            if self.fc_use_laynorm[i] or self.fc_use_batchnorm[i]:
                add_bias = False

            # Linear operations
            self.wx.append(nn.Linear(current_input, self.fc_lay[i], bias=add_bias))

            # weight initialization
            self.wx[i].weight = nn.Parameter(
                flow.Tensor(self.fc_lay[i], current_input).uniform_(
                    -np.sqrt(0.01 / (current_input + self.fc_lay[i])),
                    np.sqrt(0.01 / (current_input + self.fc_lay[i])),
                )
            )
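            # note: assigning a bias Parameter here re-registers a bias even
            # when the Linear layer above was created with bias=False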
            self.wx[i].bias = nn.Parameter(flow.zeros(self.fc_lay[i]))

            current_input = self.fc_lay[i]
Example #10
    def __init__(
        self,
        img_size=224,
        patch_size=4,
        in_chans=3,
        num_classes=1000,
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.1,
        norm_layer=nn.LayerNorm,
        ape=False,
        patch_norm=True,
        use_checkpoint=False,
        **kwargs,
    ):
        super().__init__()

        self.num_classes = num_classes
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.num_features = int(embed_dim * 2**(self.num_layers - 1))
        self.mlp_ratio = mlp_ratio

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None,
        )
        num_patches = self.patch_embed.num_patches
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        # absolute position embedding
        if self.ape:
            self.absolute_pos_embed = nn.Parameter(
                flow.zeros(1, num_patches, embed_dim))
            # trunc_normal_(self.absolute_pos_embed, std=.02)
            self.absolute_pos_embed.trunc_normal_(std=0.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        # dpr = [x.item() for x in flow.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        # TODO: here we use numpy, may have little difference with torch.linspace
        # stochastic depth decay rule
        dpr = [x for x in np.linspace(0, drop_path_rate, sum(depths))]

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2**i_layer),
                input_resolution=(
                    patches_resolution[0] // (2**i_layer),
                    patches_resolution[1] // (2**i_layer),
                ),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=self.mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchMerging if
                (i_layer < self.num_layers - 1) else None,
                use_checkpoint=use_checkpoint,
            )
            self.layers.append(layer)

        self.norm = norm_layer(self.num_features)
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.head = (nn.Linear(self.num_features, num_classes)
                     if num_classes > 0 else nn.Identity())

        self.apply(self._init_weights)