def __init__(
    self,
    num_hidden_layers: int,
    hidden_size: int,
    num_attention_heads: int,
    intermediate_size: int,
    attention_probs_dropout_prob: float,
    hidden_dropout_prob: float,
    hidden_act: str = "relu",
) -> None:
    super().__init__()
    self.layers = nn.ModuleList([
        OFEncoderLayer(
            hidden_size,
            num_attention_heads,
            intermediate_size,
            attention_probs_dropout_prob,
            hidden_dropout_prob,
            hidden_act,
        ) for _ in range(num_hidden_layers)
    ])
def __init__(
    self,
    d_model=256,
    n_heads=4,
    d_ff=2048,
    n_blocks=6,
    pos_dropout=0.0,
    slf_attn_dropout=0.0,
    ffn_dropout=0.0,
    residual_dropout=0.1,
    normalize_before=False,
    concat_after=False,
    relative_positional=False,
    activation="relu",
):
    super(TransformerEncoder, self).__init__()

    self.normalize_before = normalize_before
    self.relative_positional = relative_positional

    self.pos_emb = PositionalEncoding(d_model, pos_dropout)

    self.blocks = nn.ModuleList([
        TransformerEncoderLayer(
            n_heads,
            d_model,
            d_ff,
            slf_attn_dropout,
            ffn_dropout,
            residual_dropout=residual_dropout,
            normalize_before=normalize_before,
            concat_after=concat_after,
            relative_positional=relative_positional,
            activation=activation,
        ) for _ in range(n_blocks)
    ])

    if self.normalize_before:
        self.norm = nn.LayerNorm(d_model)
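# Usage sketch for the TransformerEncoder constructor above. The argument
# names come from the signature; the chosen values, input shape, and the
# commented forward() call are illustrative assumptions, since the forward
# pass is not shown in this snippet.
import oneflow as flow

encoder = TransformerEncoder(
    d_model=256,
    n_heads=4,
    d_ff=2048,
    n_blocks=6,
    residual_dropout=0.1,
    normalize_before=True,  # enables the final nn.LayerNorm branch
)
x = flow.randn(8, 100, 256)  # hypothetical batch: 8 sequences, 100 frames, 256 dims
# out = encoder(x)  # exact forward signature (e.g. mask arguments) is not shown here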
def __init__(self, params):
    super(TransformerLanguageModel, self).__init__(params)

    self.model_type = "transformer_lm"
    self.normalize_before = False
    self.smoothing = params["smoothing"]
    self.vocab_size = params["vocab_size"]
    self.num_blocks = params["num_blocks"]

    self.embedding = nn.Embedding(self.vocab_size, params["d_model"])
    self.pos_embedding = PositionalEncoding(params["d_model"], 0.0)

    self.blocks = nn.ModuleList([
        TransformerEncoderLayer(
            params["n_heads"],
            params["d_model"],
            params["d_ff"],
            slf_attn_dropout=0.0,
            ffn_dropout=0.0,
            residual_dropout=params["residual_dropout"],
            normalize_before=False,
            concat_after=False,
            activation="glu",
        ) for _ in range(self.num_blocks)
    ])

    if self.normalize_before:
        self.after_norm = nn.LayerNorm(params["d_model"])

    self.output_project = nn.Linear(params["d_model"], self.vocab_size)

    if params["share_embedding"]:
        self.output_project.weight = self.embedding.weight
        print("Sharing the embedding weights with the output projection layer!")

    self.crit = LabelSmoothingLoss(
        size=self.vocab_size, smoothing=self.smoothing, padding_idx=PAD)
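# The language model above is configured entirely through a `params` dict.
# The keys below are exactly the ones read in this constructor; the concrete
# values are illustrative assumptions, and the base class called via
# super().__init__(params) may require additional keys not visible here.
lm_params = {
    "vocab_size": 5000,
    "d_model": 256,
    "n_heads": 4,
    "d_ff": 2048,
    "num_blocks": 6,
    "residual_dropout": 0.1,
    "smoothing": 0.1,
    "share_embedding": True,  # ties the embedding and output projection weights
}
# lm = TransformerLanguageModel(lm_params)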
def __init__(
    self,
    num_patches,
    emb_dim,
    mlp_dim,
    num_layers=12,
    num_heads=12,
    dropout_rate=0.1,
    attn_dropout_rate=0.0,
):
    super(Encoder, self).__init__()

    # positional embedding
    self.pos_embedding = PositionEmbs(num_patches, emb_dim, dropout_rate)

    # encoder blocks
    in_dim = emb_dim
    self.encoder_layers = nn.ModuleList()
    for i in range(num_layers):
        layer = EncoderBlock(in_dim, mlp_dim, num_heads, dropout_rate,
                             attn_dropout_rate)
        self.encoder_layers.append(layer)
    self.norm = LayerNorm(in_dim)
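# Sketch of building the ViT-style Encoder above for 224x224 inputs split
# into 16x16 patches. The patch-count arithmetic is the standard ViT one;
# the dimensions are ViT-Base-like values chosen here as assumptions.
num_patches = (224 // 16) ** 2  # 196 patches per image
vit_encoder = Encoder(
    num_patches=num_patches,
    emb_dim=768,
    mlp_dim=3072,
    num_layers=12,
    num_heads=12,
    dropout_rate=0.1,
)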
def __init__(
    self,
    d_input,
    n_layers,
    n_head,
    d_k,
    d_v,
    d_model,
    d_inner,
    dropout=0.1,
    pe_maxlen=5000,
):
    super(Encoder, self).__init__()

    # parameters
    self.d_input = d_input
    self.n_layers = n_layers
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v
    self.d_model = d_model
    self.d_inner = d_inner
    self.dropout_rate = dropout
    self.pe_maxlen = pe_maxlen

    # use linear transformation with layer norm to replace input embedding
    self.linear_in = nn.Linear(d_input, d_model)
    self.layer_norm_in = nn.LayerNorm(d_model)
    self.positional_encoding = PositionalEncoding(d_model, max_len=pe_maxlen)
    self.dropout = nn.Dropout(dropout)

    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
        for _ in range(n_layers)
    ])
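# Sketch for the speech-style Encoder above: it replaces a token embedding
# with Linear + LayerNorm over acoustic features, so d_input is the input
# feature dimension (e.g. 80-dim filterbanks). All values below are
# illustrative assumptions.
speech_encoder = Encoder(
    d_input=80,
    n_layers=6,
    n_head=8,
    d_k=64,
    d_v=64,
    d_model=512,
    d_inner=2048,
    dropout=0.1,
)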
def __init__(
    self,
    num_layers,
    max_position_embeddings,
    hidden_size,
    intermediate_size,
    nheads,
    activation,
    chunk_size_feed_forward=0,
    layer_norm_eps=1e-5,
    attn_dropout=0,
    hidden_dropout=0,
    position_embedding_type="absolute",
    is_decoder=False,
    add_cross_attention=False,
):
    super(BertEncoder, self).__init__()
    self.add_cross_attention = add_cross_attention
    self.num_layers = num_layers
    self.layer = nn.ModuleList([
        BertLayer(
            max_position_embeddings,
            hidden_size,
            intermediate_size,
            nheads,
            activation,
            chunk_size_feed_forward,
            layer_norm_eps,
            attn_dropout,
            hidden_dropout,
            position_embedding_type,
            is_decoder,
            add_cross_attention,
        ) for _ in range(num_layers)
    ])
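# Sketch of a decoder-style BertEncoder configuration. is_decoder and
# add_cross_attention are simply forwarded to every BertLayer, which
# presumably adds a cross-attention block (BertLayer itself is not shown
# here). The sizes are BERT-base-like assumptions.
cross_encoder = BertEncoder(
    num_layers=12,
    max_position_embeddings=512,
    hidden_size=768,
    intermediate_size=3072,
    nheads=12,
    activation="gelu",
    is_decoder=True,
    add_cross_attention=True,
)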
def __init__(
    self, model, input_size, output_size, num_experts, noisy_gating=True, k=4
):
    super(MoE, self).__init__()
    self.noisy_gating = noisy_gating
    self.num_experts = num_experts
    self.output_size = output_size
    self.input_size = input_size
    self.k = k

    # instantiate experts (the same `model` instance is reused for every
    # expert, so all experts share parameters)
    self.experts = nn.ModuleList([model for i in range(self.num_experts)])

    # gating and noise projections
    self.w_gate = nn.Parameter(
        flow.zeros(input_size, num_experts), requires_grad=True
    )
    self.w_noise = nn.Parameter(
        flow.zeros(input_size, num_experts), requires_grad=True
    )

    self.softplus = nn.Softplus()
    self.softmax = nn.Softmax(1)

    assert self.k <= self.num_experts
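# Sketch of wrapping a small expert network in the MoE module above. The
# expert architecture and sizes are assumptions for illustration. As written,
# the constructor stores the same `model` object num_experts times, so the
# experts share parameters; independent experts would require passing deep
# copies (a change to the class, not shown in the snippet).
import oneflow.nn as nn

expert = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 10))
moe = MoE(model=expert, input_size=128, output_size=10, num_experts=8, k=2)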
def __init__(self, options):
    super(SincNet, self).__init__()

    self.cnn_N_filt = options["cnn_N_filt"]
    self.cnn_len_filt = options["cnn_len_filt"]
    self.cnn_max_pool_len = options["cnn_max_pool_len"]
    self.cnn_act = options["cnn_act"]
    self.cnn_drop = options["cnn_drop"]
    self.cnn_use_laynorm = options["cnn_use_laynorm"]
    self.cnn_use_batchnorm = options["cnn_use_batchnorm"]
    self.cnn_use_laynorm_inp = options["cnn_use_laynorm_inp"]
    self.cnn_use_batchnorm_inp = options["cnn_use_batchnorm_inp"]
    self.input_dim = int(options["input_dim"])
    self.fs = options["fs"]
    self.N_cnn_lay = len(options["cnn_N_filt"])

    self.conv = nn.ModuleList([])
    self.bn = nn.ModuleList([])
    self.ln = nn.ModuleList([])
    self.act = nn.ModuleList([])
    self.drop = nn.ModuleList([])

    if self.cnn_use_laynorm_inp:
        self.ln0 = LayerNorm(self.input_dim)
    if self.cnn_use_batchnorm_inp:
        self.bn0 = nn.BatchNorm1d([self.input_dim], momentum=0.05)

    current_input = self.input_dim

    for i in range(self.N_cnn_lay):
        N_filt = int(self.cnn_N_filt[i])
        len_filt = int(self.cnn_len_filt[i])

        # dropout
        self.drop.append(nn.Dropout(p=self.cnn_drop[i]))

        # activation
        self.act.append(act_fun(self.cnn_act[i]))

        # layer norm initialization
        self.ln.append(
            LayerNorm((
                N_filt,
                int((current_input - self.cnn_len_filt[i] + 1) /
                    self.cnn_max_pool_len[i]),
            )))

        self.bn.append(
            nn.BatchNorm1d(
                N_filt,
                int((current_input - self.cnn_len_filt[i] + 1) /
                    self.cnn_max_pool_len[i]),
                momentum=0.05,
            ))

        if i == 0:
            self.conv.append(
                SincConv_fast(self.cnn_N_filt[0], self.cnn_len_filt[0], self.fs))
        else:
            self.conv.append(
                nn.Conv1d(self.cnn_N_filt[i - 1], self.cnn_N_filt[i],
                          self.cnn_len_filt[i]))

        current_input = int((current_input - self.cnn_len_filt[i] + 1) /
                            self.cnn_max_pool_len[i])

    self.out_dim = current_input * N_filt
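# The SincNet constructor above is driven entirely by an `options` dict. The
# keys below are exactly the ones it reads; the values are illustrative
# assumptions (roughly SincNet-style filter sizes), not defaults from the
# source.
sincnet_options = {
    "input_dim": 3200,                     # samples per waveform chunk
    "fs": 16000,                           # sampling rate in Hz
    "cnn_N_filt": [80, 60, 60],
    "cnn_len_filt": [251, 5, 5],
    "cnn_max_pool_len": [3, 3, 3],
    "cnn_act": ["relu", "relu", "relu"],
    "cnn_drop": [0.0, 0.0, 0.0],
    "cnn_use_laynorm": [True, True, True],
    "cnn_use_batchnorm": [False, False, False],
    "cnn_use_laynorm_inp": True,
    "cnn_use_batchnorm_inp": False,
}
cnn_front_end = SincNet(sincnet_options)
# cnn_front_end.out_dim is the flattened per-chunk feature size (time steps
# after conv/pooling multiplied by the last layer's filter count).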
def __init__(self, options):
    super(MLP, self).__init__()

    self.input_dim = int(options["input_dim"])
    self.fc_lay = options["fc_lay"]
    self.fc_drop = options["fc_drop"]
    self.fc_use_batchnorm = options["fc_use_batchnorm"]
    self.fc_use_laynorm = options["fc_use_laynorm"]
    self.fc_use_laynorm_inp = options["fc_use_laynorm_inp"]
    self.fc_use_batchnorm_inp = options["fc_use_batchnorm_inp"]
    self.fc_act = options["fc_act"]

    self.wx = nn.ModuleList([])
    self.bn = nn.ModuleList([])
    self.ln = nn.ModuleList([])
    self.act = nn.ModuleList([])
    self.drop = nn.ModuleList([])

    # input layer normalization
    if self.fc_use_laynorm_inp:
        self.ln0 = LayerNorm(self.input_dim)

    # input batch normalization
    if self.fc_use_batchnorm_inp:
        self.bn0 = nn.BatchNorm1d([self.input_dim], momentum=0.05)

    self.N_fc_lay = len(self.fc_lay)

    current_input = self.input_dim

    # initialization of hidden layers
    for i in range(self.N_fc_lay):

        # dropout
        self.drop.append(nn.Dropout(p=self.fc_drop[i]))

        # activation
        self.act.append(act_fun(self.fc_act[i]))

        add_bias = True

        # layer norm initialization
        self.ln.append(LayerNorm(self.fc_lay[i]))
        self.bn.append(nn.BatchNorm1d(self.fc_lay[i], momentum=0.05))

        if self.fc_use_laynorm[i] or self.fc_use_batchnorm[i]:
            add_bias = False

        # linear operations
        self.wx.append(nn.Linear(current_input, self.fc_lay[i], bias=add_bias))

        # weight initialization
        self.wx[i].weight = nn.Parameter(
            flow.Tensor(self.fc_lay[i], current_input).uniform_(
                -np.sqrt(0.01 / (current_input + self.fc_lay[i])),
                np.sqrt(0.01 / (current_input + self.fc_lay[i])),
            ))
        self.wx[i].bias = nn.Parameter(flow.zeros(self.fc_lay[i]))

        current_input = self.fc_lay[i]
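# Companion sketch: an MLP configured to sit on top of the SincNet front end,
# with input_dim matching its flattened output. The keys mirror the ones read
# by the constructor; the layer sizes and activation names are assumptions
# (they must be supported by act_fun, which is not shown here).
mlp_options = {
    "input_dim": cnn_front_end.out_dim,    # hypothetical upstream SincNet
    "fc_lay": [2048, 2048, 462],
    "fc_drop": [0.0, 0.0, 0.0],
    "fc_use_laynorm": [True, True, False],
    "fc_use_batchnorm": [False, False, False],
    "fc_use_laynorm_inp": True,
    "fc_use_batchnorm_inp": False,
    "fc_act": ["leaky_relu", "leaky_relu", "softmax"],
}
classifier = MLP(mlp_options)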
def __init__(
    self,
    img_size=224,
    patch_size=4,
    in_chans=3,
    num_classes=1000,
    embed_dim=96,
    depths=[2, 2, 6, 2],
    num_heads=[3, 6, 12, 24],
    window_size=7,
    mlp_ratio=4.0,
    qkv_bias=True,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    drop_path_rate=0.1,
    norm_layer=nn.LayerNorm,
    ape=False,
    patch_norm=True,
    use_checkpoint=False,
    **kwargs,
):
    super().__init__()

    self.num_classes = num_classes
    self.num_layers = len(depths)
    self.embed_dim = embed_dim
    self.ape = ape
    self.patch_norm = patch_norm
    self.num_features = int(embed_dim * 2**(self.num_layers - 1))
    self.mlp_ratio = mlp_ratio

    # split image into non-overlapping patches
    self.patch_embed = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dim,
        norm_layer=norm_layer if self.patch_norm else None,
    )
    num_patches = self.patch_embed.num_patches
    patches_resolution = self.patch_embed.patches_resolution
    self.patches_resolution = patches_resolution

    # absolute position embedding
    if self.ape:
        self.absolute_pos_embed = nn.Parameter(
            flow.zeros(1, num_patches, embed_dim))
        self.absolute_pos_embed.trunc_normal_(std=0.02)

    self.pos_drop = nn.Dropout(p=drop_rate)

    # stochastic depth decay rule
    # dpr = [x.item() for x in flow.linspace(0, drop_path_rate, sum(depths))]
    # TODO: numpy is used here; values may differ slightly from torch.linspace
    dpr = [x for x in np.linspace(0, drop_path_rate, sum(depths))]

    # build layers
    self.layers = nn.ModuleList()
    for i_layer in range(self.num_layers):
        layer = BasicLayer(
            dim=int(embed_dim * 2**i_layer),
            input_resolution=(
                patches_resolution[0] // (2**i_layer),
                patches_resolution[1] // (2**i_layer),
            ),
            depth=depths[i_layer],
            num_heads=num_heads[i_layer],
            window_size=window_size,
            mlp_ratio=self.mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
            norm_layer=norm_layer,
            downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
            use_checkpoint=use_checkpoint,
        )
        self.layers.append(layer)

    self.norm = norm_layer(self.num_features)
    self.avgpool = nn.AdaptiveAvgPool1d(1)
    self.head = (nn.Linear(self.num_features, num_classes)
                 if num_classes > 0 else nn.Identity())

    self.apply(self._init_weights)
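# Usage sketch for the Swin Transformer constructor above. Its defaults
# already describe the Swin-T layout (embed_dim 96, depths [2, 2, 6, 2],
# heads [3, 6, 12, 24]); the class name and the commented forward call are
# assumed from context, since only __init__ is shown here.
import oneflow as flow

model = SwinTransformer(img_size=224, patch_size=4, num_classes=1000)
x = flow.randn(1, 3, 224, 224)  # one 224x224 RGB image
# logits = model(x)  # expected shape: (1, 1000)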