def __init__(self, embed_size, cate_dim, args):
    super(NeRTModel, self).__init__()
    hidden_size = args.hidden_size
    num_layers = args.num_layers
    self.rnn = nn.LSTM(embed_size, int(hidden_size / 2), num_layers,
                       batch_first=True, bidirectional=True)
    # self.rnn = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
    # self.mlp = nn.Linear(hidden_size*2, embed_size)
    self.mlp = nn.Sequential(
        nn.Linear(hidden_size * 2, hidden_size * 11 // 10),
        nn.ReLU(inplace=True),
        nn.Linear(hidden_size * 11 // 10, hidden_size * 8 // 10),
        nn.ReLU(inplace=True),
        nn.Linear(hidden_size * 8 // 10, hidden_size * 7 // 10),
        nn.ReLU(inplace=True),
        nn.Linear(hidden_size * 7 // 10, hidden_size * 6 // 10),
        nn.ReLU(inplace=True),
        nn.Linear(hidden_size * 6 // 10, embed_size))
    self._dropout = nn.Dropout(0.3)
    self._W_attn = nn.Parameter(torch.zeros([hidden_size * 2, 1], dtype=torch.float32),
                                requires_grad=True)
    self._b_attn = nn.Parameter(torch.zeros([1], dtype=torch.float32),
                                requires_grad=True)
    # num_heads must divide embed_size: use 20 heads when possible, otherwise fall back to 10.
    self._mha = nn.MultiheadAttention(embed_size, 20 if (embed_size % 20) == 0 else 10)
    self._mlp_mha = nn.Linear(embed_size, hidden_size)
    nn.init.xavier_normal_(self._W_attn.data)
    self._rnn_hidden_size = hidden_size
def __init__(self, n_stacks, n_dims, n_heads, seq_len, n_multihead=1, dropout=0.0):
    super(StackedNMultiHeadAttention, self).__init__()
    self.n_stacks = n_stacks
    self.n_multihead = n_multihead
    self.n_dims = n_dims
    self.norm_layers = nn.LayerNorm(n_dims)
    # Each of the n_stacks blocks gets its own n_multihead attention layers.
    # Note: list multiplication (n_stacks * [module]) would repeat the *same* module object
    # and share its parameters everywhere, so independent layers are built with comprehensions.
    self.multihead_layers = nn.ModuleList([
        nn.ModuleList([
            nn.MultiheadAttention(embed_dim=n_dims,
                                  num_heads=n_heads,
                                  dropout=dropout)
            for _ in range(n_multihead)
        ])
        for _ in range(n_stacks)
    ])
    self.ffn = nn.ModuleList([FFN(n_dims) for _ in range(n_stacks)])
    # Upper-triangular boolean mask: True marks future positions that must not be attended to.
    self.mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).to(dtype=torch.bool)
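# A minimal sketch (not part of the original model) of how a causal mask built with
# torch.triu(..., diagonal=1).bool() is consumed by nn.MultiheadAttention: True entries mark
# positions a query may NOT attend to. Tensor names and sizes here are illustrative only.
import torch
import torch.nn as nn

seq_len, batch, n_dims, n_heads = 6, 2, 32, 4
mha = nn.MultiheadAttention(embed_dim=n_dims, num_heads=n_heads)  # expects (L, N, E) by default
x = torch.randn(seq_len, batch, n_dims)
causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
out, attn_weights = mha(x, x, x, attn_mask=causal_mask)
print(out.shape)           # torch.Size([6, 2, 32])
print(attn_weights.shape)  # torch.Size([2, 6, 6]), averaged over heads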
def __init__(self, input_dim, hidden_dim, ff_dim=2048, heads=1, max_len=5000,
             dropout=0, rnn=False, attn=False, residual=False):
    super().__init__()
    self.attn = nn.MultiheadAttention(input_dim, heads, dropout=dropout) if attn else None
    self.ln = BlockNorm(input_dim)
    self.dropout = nn.Dropout(dropout)
    self.rnn = nn.LSTM(input_dim, input_dim) if rnn else None
    self.ff = Boom(input_dim, dropout=dropout, hidden_dim=int(ff_dim * 2), shortcut=True)
    self.residual = residual
    self.max_len = max_len
def __init__(self, n_skill, max_seq=100, embed_dim=128):
    super(SAKTModel, self).__init__()
    self.n_skill = n_skill
    self.embed_dim = embed_dim
    self.embedding = nn.Embedding(2 * n_skill + 1, embed_dim)
    self.pos_embedding1 = nn.Embedding(max_seq - 1, embed_dim)
    self.pos_embedding2 = nn.Embedding(max_seq - 1, embed_dim)
    self.tsdiff_embedding = nn.Embedding(301, embed_dim)
    self.elptime_embedding = nn.Embedding(301, embed_dim)
    self.e_embedding = nn.Embedding(n_skill + 1, embed_dim)
    self.part_embedding = nn.Embedding(8, embed_dim)
    self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=8, dropout=0.2)
    self.dropout = nn.Dropout(0.2)
    self.layer_normal = nn.LayerNorm(embed_dim)
    self.ffn = FFN(embed_dim)
    self.pred = nn.Linear(embed_dim, 1)
def __init__(self, n_skill, max_seq=100, embed_dim=128, num_heads=8, dropout=0.2):
    super(SAKTModel, self).__init__()
    self.n_skill = n_skill
    self.embed_dim = embed_dim
    self.embedding = nn.Embedding(2 * n_skill + 1, embed_dim)
    self.pos_embedding_enc = nn.Embedding(max_seq - 1, embed_dim)
    self.pos_embedding_dec = nn.Embedding(max_seq - 1, embed_dim)
    self.e_embedding = nn.Embedding(n_skill + 1, embed_dim)
    self.part_embedding = nn.Embedding(8, embed_dim)
    self.elapsed_time_embedding = nn.Embedding(302, embed_dim)
    self.duration_previous_content_embedding = nn.Embedding(302, embed_dim)
    self.multi_att_enc_self1 = SelfAttentionLayer(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout)
    self.multi_att_enc_self2 = SelfAttentionLayer(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout)
    self.multi_att_dec_self1 = SelfAttentionLayer(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout)
    self.multi_att_dec_self2 = SelfAttentionLayer(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout)
    self.multi_att_dec = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout)
    self.dropout = nn.Dropout(dropout)  # use the configured rate instead of a hard-coded 0.2
    self.layer_normal = nn.LayerNorm(embed_dim)
    self.ffn = FFN(embed_dim)
    self.pred = nn.Linear(embed_dim, 1)
def __init__(self, config: dict) -> None:
    super(Model, self).__init__()
    self.word_embedding = config["embedding"]
    if not config["freeze_embedding"]:
        self.word_embedding.requires_grad_(True)
    else:
        self.word_embedding.requires_grad_(False)
    self.word_encoder = DynamicRNN(config["embedding_dim"],
                                   hidden_size=config["text_hidden_size"],
                                   num_layers=config["text_layers"],
                                   dropout=config["dropout"],
                                   bias_init=config["bias_init"],
                                   batch_first=True,
                                   bidirectional=True,
                                   rnn_type="GRU")
    self.word_output_size = config["text_hidden_size"] * config["text_layers"] * 2
    self.img_fc = nn.Linear(config["img_input_size"], self.word_output_size)
    self.tanh1 = nn.Tanh()
    self.attn = nn.MultiheadAttention(self.word_output_size, config["attention_nhead"])
    self.fusion_encoder = MultiheadAttentionEncoder(self.word_output_size,
                                                    config["fusion_nheads"],
                                                    config["uniform_bound"])
    self.output_layer = OutputLayer(config["task"], self.word_output_size,
                                    config["output_size"], config["dropout"])
def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048,
             dropout: float = 0.1, activation: str = "relu",
             normalize_before: bool = True) -> None:
    super(TransformerEncoderLayer, self).__init__()
    # Note: attention dropout is fixed at 0.0 here; `dropout` is only applied
    # in the FFN and residual paths below.
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=0.0)
    # Implementation of Feedforward model
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.dropout = nn.Dropout(dropout)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.activation = _get_activation_fn(activation)
    self.normalize_before = normalize_before
def __init__(self, d_model=512, nhead=8, dim_feedforward=2048, dropout=0.1, activation="relu"):
    super(TransformerEncoderLayer, self).__init__()
    # With nhead attention heads, each head works on d_model // nhead dimensions;
    # nn.MultiheadAttention handles that split internally.
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    # The feed-forward block is two linear layers mapping d_model -> dim_feedforward -> d_model
    # (512 -> 2048 -> 512 with the defaults).
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.dropout = nn.Dropout(dropout)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.activation = _get_activation_fn(activation)
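# A hedged sketch of the post-norm forward pass these submodules imply (self-attention ->
# add & norm -> FFN -> add & norm). The method below is an assumption, not the original code.
def forward(self, src, src_mask=None, src_key_padding_mask=None):
    # self-attention + residual connection, then LayerNorm
    attn_out, _ = self.self_attn(src, src, src,
                                 attn_mask=src_mask,
                                 key_padding_mask=src_key_padding_mask)
    src = self.norm1(src + self.dropout1(attn_out))
    # feed-forward (d_model -> dim_feedforward -> d_model) + residual, then LayerNorm
    ff_out = self.linear2(self.dropout(self.activation(self.linear1(src))))
    src = self.norm2(src + self.dropout2(ff_out))
    return src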
def __init__(self, input_dim, lstm_hidden_dim, lstm_layers, lstm_dropout, word_pad_idx,
             attn_heads=None, attn_dropout=None):
    super().__init__()
    self.word_pad_idx = word_pad_idx
    # BiLSTM layer
    self.lstm = nn.LSTM(input_size=input_dim,
                        hidden_size=lstm_hidden_dim,
                        num_layers=lstm_layers,
                        bidirectional=True,
                        dropout=lstm_dropout if lstm_layers > 1 else 0)
    # Optional attention layer; the bidirectional LSTM doubles the feature size,
    # hence embed_dim = lstm_hidden_dim * 2.
    self.attn_heads = attn_heads
    if self.attn_heads:
        self.attn = nn.MultiheadAttention(embed_dim=lstm_hidden_dim * 2,
                                          num_heads=attn_heads,
                                          dropout=attn_dropout)
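# A hedged usage sketch: when the optional attention layer is enabled, padding positions are
# typically excluded with key_padding_mask built from word_pad_idx. The logic below is an
# assumption for illustration, not the tagger's original forward code.
import torch
import torch.nn as nn

seq_len, batch, lstm_hidden_dim, pad_idx = 12, 4, 32, 0
lstm_out = torch.randn(seq_len, batch, lstm_hidden_dim * 2)  # BiLSTM output, (L, N, 2H)
words = torch.randint(1, 50, (seq_len, batch))
words[-3:, 0] = pad_idx                                      # pretend the first sequence is padded

attn = nn.MultiheadAttention(embed_dim=lstm_hidden_dim * 2, num_heads=4, dropout=0.1)
key_padding_mask = (words == pad_idx).transpose(0, 1)        # (N, L), True = ignore this key
attn_out, _ = attn(lstm_out, lstm_out, lstm_out, key_padding_mask=key_padding_mask)
print(attn_out.shape)                                        # torch.Size([12, 4, 64])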
def __init__(self, config, bert_hidden_states=4, num_heads=1, dropout=0.1):
    config = deepcopy(config)
    config.output_hidden_states = True
    super(BertForQuestionAnswering2, self).__init__(config)
    self.num_labels = config.num_labels
    self.bert_hidden_states = bert_hidden_states
    self.bert = BertModel(config)
    # config.num_labels = 1
    num_labels = 1
    self.qa_outputs = nn.Linear(
        config.hidden_size * self.bert_hidden_states * 2, num_labels)
    self.qa_attn = nn.MultiheadAttention(config.hidden_size * self.bert_hidden_states,
                                         num_heads=num_heads,
                                         dropout=dropout)
    self.sm = nn.Sigmoid()
    self.init_weights()
    self.qa_outputs.bias.data.fill_(1.0)
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, faster=False, use_linear_attention=False): super().__init__() self.faster = faster self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model if self.faster: self.linear1 = nn.Linear(d_model, dim_feedforward // 4) self.dropout = nn.Dropout(dropout, inplace=True) self.linear2 = nn.Linear(dim_feedforward // 4, d_model) else: self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout, inplace=True) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, inplace=True) self.dropout2 = nn.Dropout(dropout, inplace=True) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before
def __init__(self, config):
    super().__init__()
    self.config = config
    self.att = nn.MultiheadAttention(
        embed_dim=config.n_embd,
        num_heads=config.n_head,
        dropout=config.attn_pdrop,
    )
    assert config.n_embd % config.n_head == 0
    # key, query, value projections for all heads
    self.key = nn.Linear(config.n_embd, config.n_embd)
    self.query = nn.Linear(config.n_embd, config.n_embd)
    self.value = nn.Linear(config.n_embd, config.n_embd)
    # regularization
    self.resid_drop = nn.Dropout(config.resid_pdrop)
    # output projection
    self.proj = nn.Linear(config.n_embd, config.n_embd)
    # Causal mask to ensure that attention is only applied to the left in the input sequence.
    # Here dtype=bool, torch.triu instead of tril, and diagonal=1 are all important:
    # True entries mark positions that nn.MultiheadAttention must NOT attend to.
    mask_mat = torch.ones(config.block_size, config.block_size, dtype=torch.bool)
    self.register_buffer("mask", torch.triu(mask_mat, diagonal=1))
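# A hedged sketch (assumed, not the original forward) of how the registered causal buffer is
# typically consumed: slice it to the current sequence length and pass it as attn_mask.
# nn.MultiheadAttention defaults to sequence-first input, hence the transposes.
def forward(self, x):
    B, T, C = x.shape                  # batch-first input assumed: (B, T, n_embd)
    q = self.query(x).transpose(0, 1)  # (T, B, C)
    k = self.key(x).transpose(0, 1)
    v = self.value(x).transpose(0, 1)
    causal = self.mask[:T, :T]         # True above the diagonal = blocked positions
    y, _ = self.att(q, k, v, attn_mask=causal)
    y = y.transpose(0, 1)              # back to (B, T, C)
    return self.resid_drop(self.proj(y))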
def __init__(self, dim_model, heads_en, total_ex, total_cat, total_tg, seq_len):
    super().__init__()
    self.seq_len = seq_len - 1
    self.total_cat = total_cat
    self.embd_ex = nn.Embedding(total_ex, embedding_dim=dim_model)
    # self.embd_cat = nn.Embedding(total_cat + 1, embedding_dim=dim_model)
    self.embd_tg = nn.Embedding(total_tg + 1, embedding_dim=dim_model)
    self.embd_pos = nn.Embedding(seq_len, embedding_dim=dim_model)
    self.pos_norm = nn.LayerNorm(dim_model, eps=1.0e-12)
    self.dt_fc = nn.Linear(1, dim_model, bias=False)
    self.cat_fc = nn.Linear(total_cat + 1, dim_model)
    self.cate_proj = nn.Sequential(
        nn.Linear(dim_model * 5, dim_model),
        nn.LayerNorm(dim_model),
    )
    self.multi_en = nn.MultiheadAttention(embed_dim=dim_model, num_heads=heads_en)
    self.ffn_en = Feed_Forward_block(dim_model)
    self.layer_norm1 = nn.LayerNorm(dim_model)
    self.layer_norm2 = nn.LayerNorm(dim_model)
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False): """ Parameters ---------- d_model : {int, scalar} dim of the model input and output(default: 512) nhead : {int, scalar} parallel attention heads(default: 8) dim_feedforward : {int, scalar} FFN layer hidden neurons(default: 2048) dropout : {float, scalar} a Dropout layer on attn_output_weights.(default: 0.1) activation : {str-like, scalar} ("relu"(default), "gelu", "glu") normalize_before : {bool, scalar} False(default), Norm Layer whether before SA or FFN """ super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before
def __init__(self, language, device, embed_dim, hidden_dim, num_pos, num_embed, num_heads,
             num_layers, dropout, n_classes):
    super().__init__()
    self.device = device
    self.language = language
    self.max_seq_len = num_pos
    # self.w_embedding = nn.Embedding(self.language.n_words, embed_dim)
    # GloVe vectors are 300-dimensional, so the embed_dim argument is overridden here.
    embed_dim = 300
    self.w_embedding = glove_embeddings(trainable=True)
    self.pos_embeddings = nn.Embedding(num_pos, embed_dim)
    # self.attention = TaskAttention(device, dropout)
    # self.t_embedding = nn.Embedding(num_layers, embed_dim)
    # self.t_embedding.requires_grad = False
    # self.ff_embedding = nn.Embedding(num_layers, embed_dim)
    # self.ff_embedding.requires_grad = False
    self.dropout = nn.Dropout(dropout)
    self.mhas = nn.ModuleList()
    self.ff = nn.ModuleList()
    self.ln_1, self.ln_2 = nn.ModuleList(), nn.ModuleList()
    self.tasks = []
    for i in range(num_layers):
        self.mhas.append(
            nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout))
        # self.mhas.append(nn.MultiheadAttention(embed_dim, num_heads, dropout=0))
        self.ff.append(
            nn.Sequential(nn.Linear(embed_dim, hidden_dim),
                          nn.ReLU(),
                          nn.Linear(hidden_dim, embed_dim)))
        self.ln_1.append(nn.LayerNorm(embed_dim, eps=1e-12))
        self.ln_2.append(nn.LayerNorm(embed_dim, eps=1e-12))
        self.tasks.append(i)
    self.classify = nn.Linear(embed_dim, n_classes)
def __init__(self, num_conv_blocks, kernel_size, num_heads=4, d_model=128, dropout=0.1,
             device="cuda:0"):
    super(StackedEncoder, self).__init__()
    # self.pos_encoder = PositionalEncoding(d_model, dropout, device)
    # self.pos_norm = nn.LayerNorm(d_model)
    self.conv_blocks = nn.ModuleList([
        DepthwiseSeparableConv(d_model, d_model, kernel_size)
        for _ in range(num_conv_blocks)
    ])
    self.conv_norm = nn.ModuleList(
        [nn.LayerNorm(d_model) for _ in range(num_conv_blocks)])
    self.self_attn_block = nn.MultiheadAttention(d_model, num_heads, dropout)
    # self.ffn_block = FFNBlock(d_model)
    self.ffn_1 = Initialized_Conv1d(d_model, d_model, relu=True, bias=True)
    self.ffn_1_norm = nn.LayerNorm(d_model)
    self.ffn_2 = Initialized_Conv1d(d_model, d_model, bias=True)
    self.ffn_2_norm = nn.LayerNorm(d_model)
    '''self.conv_norm = nn.ModuleList([nn.LayerNorm(d_model) for _ in range(num_conv_blocks)])
    # self.self_attn_block = nn.MultiheadAttention(d_model, num_heads, dropout)
    self.self_attn_block = MultiheadAttentionLayer(
        d_model, num_heads, device)
    self.ffn_block = nn.Linear(d_model, d_model)
    self.ffn_norm = nn.LayerNorm(d_model)'''
    self.num_conv_blocks = num_conv_blocks
    self.dropout = dropout
def __init__(self, hidden_dim, filter_size, dropout_rate, vocab_size, embedding_dim,
             pre_trained_embedding=None):
    super().__init__()
    self.hidden_dim = hidden_dim
    self.filter_size = filter_size
    self.dropout_rate = dropout_rate
    self.embedding_dim = embedding_dim
    if pre_trained_embedding is None:
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=0)
    else:
        self.embedding = nn.Embedding.from_pretrained(pre_trained_embedding,
                                                      freeze=False,
                                                      padding_idx=0)
    self.self_attention1 = nn.MultiheadAttention(self.embedding_dim, 4)
    self.layer_norm1 = nn.LayerNorm(self.embedding_dim)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(self.dropout_rate)
    self.conv1d = nn.Conv1d(self.embedding_dim, self.hidden_dim, self.filter_size)
    self.bi_rnn = nn.LSTM(self.hidden_dim, int(self.hidden_dim / 2),
                          batch_first=False, bidirectional=True)
    self.uni_rnn = nn.LSTM(self.hidden_dim, self.hidden_dim, batch_first=False)
    # Note: despite the name, this is an adaptive *average* pool over the sequence dimension.
    self.max_pool = nn.AdaptiveAvgPool2d((1, self.hidden_dim))
    self.linear = nn.Linear(self.hidden_dim, 1)
    self.sigmoid = nn.Sigmoid()
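# A hedged sketch (not the original forward) of the layout juggling such a model needs:
# nn.Embedding gives (N, L, E), nn.MultiheadAttention with the default batch_first=False
# wants (L, N, E), and nn.Conv1d wants (N, C, L). All sizes below are illustrative.
import torch
import torch.nn as nn

batch, seq_len, embedding_dim, hidden_dim, filter_size = 4, 20, 64, 32, 3
embedding = nn.Embedding(100, embedding_dim, padding_idx=0)
attn = nn.MultiheadAttention(embedding_dim, 4)
conv1d = nn.Conv1d(embedding_dim, hidden_dim, filter_size)

tokens = torch.randint(0, 100, (batch, seq_len))
x = embedding(tokens)              # (N, L, E)
x = x.transpose(0, 1)              # (L, N, E) for MultiheadAttention
attn_out, _ = attn(x, x, x)        # (L, N, E)
x = attn_out.permute(1, 2, 0)      # (N, E, L) for Conv1d
features = conv1d(x)               # (N, hidden_dim, L - filter_size + 1)
print(features.shape)              # torch.Size([4, 32, 18])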
def test_auto_wrap_preset_force_leaf(self, wrap_method):
    """
    Test to ensure force-leaf modules are not wrapped and that their children are not
    wrapped either. The default_auto_wrap_policy forces leaf modules of type
    {nn.MultiheadAttention} to not be wrapped.
    """
    sequential = nn.Sequential(nn.Linear(10, 10), nn.MultiheadAttention(100, 1))
    my_auto_wrap_policy = functools.partial(default_auto_wrap_policy, min_num_params=40)
    if wrap_method == WrapMethod.WRAP_API:
        with enable_wrap(wrapper_cls=FSDP, process_group=self.process_group):
            model = auto_wrap(sequential, auto_wrap_policy=my_auto_wrap_policy)
    else:
        assert wrap_method == WrapMethod.FSDP_CTOR
        model = FSDP(sequential,
                     process_group=self.process_group,
                     fsdp_auto_wrap_policy=my_auto_wrap_policy)
    self.assertTrue(isinstance(model.module[0], FSDP))
    # Assert children of multihead attention are not wrapped
    self.assertTrue(isinstance(model.module[1], nn.MultiheadAttention))
    self.assertTrue(isinstance(model.module[1].out_proj, nn.Linear))
def __init__(self, opt, n_node):
    super(StarSessionGraph, self).__init__()
    self.hidden_size = opt.hiddenSize
    self.n_node = n_node
    self.batch_size = opt.batchSize
    self.nonhybrid = opt.nonhybrid
    self.num_heads = opt.heads
    self.embedding = nn.Embedding(self.n_node, self.hidden_size)
    self.gnn = StarGNN(self.hidden_size, step=opt.step)
    self.attn = nn.MultiheadAttention(self.hidden_size, 1)
    self.linear_one = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
    self.linear_two = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
    self.linear_three = nn.Linear(self.hidden_size, self.num_heads, bias=False)
    self.linear_four = nn.Linear(self.hidden_size, self.hidden_size)
    self.linear_transform = nn.Linear(self.hidden_size * (self.num_heads + 1),
                                      self.hidden_size, bias=True)
    self.layernorm1 = nn.LayerNorm(self.hidden_size)
    self.layernorm2 = nn.LayerNorm(self.hidden_size)
    self.layernorm3 = nn.LayerNorm(self.hidden_size)
    self.loss_function = nn.CrossEntropyLoss()
    # self.loss_function = nn.NLLLoss()
    # self.optimizer = torch.optim.Adam(self.parameters(), lr=opt.lr, weight_decay=opt.l2)
    # self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=opt.lr_dc_step, gamma=opt.lr_dc)
    self.reset_parameters()
def __init__(self, args):
    super(model_a, self).__init__()
    self.args = args
    # Encode state and action
    self.state_embedding = nn.Linear(args.state_dimension, args.state_embedding_dimension)
    self.action_embedding = nn.Linear(args.action_dimension, args.action_embedding_dimension)
    # Attention
    state_action_embedding_dim = args.state_embedding_dimension + args.action_embedding_dimension
    self.q_projection = nn.Linear(state_action_embedding_dim, state_action_embedding_dim)
    self.v_projection = nn.Linear(state_action_embedding_dim, state_action_embedding_dim)
    self.k_projection = nn.Linear(state_action_embedding_dim, state_action_embedding_dim)
    self.attention = nn.MultiheadAttention(state_action_embedding_dim, args.n_heads)
    self.predict = nn.Linear(state_action_embedding_dim, args.state_dimension)
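# A hedged sketch (not the original forward) of how separate q/k/v projections feed
# nn.MultiheadAttention; the (L, N, E) layout and all tensor names/sizes are assumptions.
# Note that nn.MultiheadAttention also applies its own internal input projections, so the
# external q/k/v layers here add an extra learned transform on top of them.
import torch
import torch.nn as nn

seq_len, batch, dim, n_heads = 10, 8, 64, 4
q_projection = nn.Linear(dim, dim)
k_projection = nn.Linear(dim, dim)
v_projection = nn.Linear(dim, dim)
attention = nn.MultiheadAttention(dim, n_heads)

x = torch.randn(seq_len, batch, dim)              # concatenated state/action embeddings
q, k, v = q_projection(x), k_projection(x), v_projection(x)
context, _ = attention(q, k, v)                   # (L, N, E)
print(context.shape)                              # torch.Size([10, 8, 64])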
def __init__(self, params):
    super(UserMemoryEmbedder, self).__init__()
    self.params = params
    self.host = torch.cuda if params["use_gpu"] else torch
    self.categories = ["focus", "database", "memory"]
    self.category_embeds = {}
    for position in self.categories:
        pos_parameter = torch.randn(params["word_embed_size"])
        if params["use_gpu"]:
            pos_parameter = pos_parameter.cuda()
        pos_parameter = nn.Parameter(pos_parameter)
        self.category_embeds[position] = pos_parameter
        # Register the parameter for training/saving.
        self.register_parameter(position, pos_parameter)
    self.category_state = None
    # Project multimodal embedding to same size as encoder.
    input_size = params["asset_feature_size"] + params["word_embed_size"]
    if params["text_encoder"] == "lstm":
        output_size = params["hidden_size"]
    else:
        output_size = params["word_embed_size"]
    self.multimodal_embed_net = nn.Linear(input_size, output_size)
    self.multimodal_attend = nn.MultiheadAttention(output_size, 1)
def __init__(self, language, device, embed_dim, hidden_dim, num_embed, num_pos, num_heads,
             num_layers, dropout, n_classes):
    super().__init__()
    self.device = device
    self.language = language
    self.encoder = TransformerEmbedder(embed_dim, num_embed, num_pos, dropout)
    self.dropout = nn.Dropout(dropout)
    self.attentions, self.feed_forwards = nn.ModuleList(), nn.ModuleList()
    self.ln_1, self.ln_2 = nn.ModuleList(), nn.ModuleList()
    for _ in range(num_layers):
        self.attentions.append(
            nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout))
        self.feed_forwards.append(
            nn.Sequential(nn.Linear(embed_dim, hidden_dim),
                          nn.ReLU(),
                          nn.Linear(hidden_dim, hidden_dim),
                          nn.ReLU(),
                          nn.Linear(hidden_dim, embed_dim)))
        self.ln_1.append(nn.LayerNorm(embed_dim, eps=1e-12))
        self.ln_2.append(nn.LayerNorm(embed_dim, eps=1e-12))
    self.classify = nn.Linear(embed_dim, n_classes)
def __init__(self, args):
    super(model_b, self).__init__()
    self.args = args
    # Encode the inputs
    self.observation_embedding = nn.Linear(
        args.observation_dimension, args.observation_embedding_dimension)
    self.action_embedding = nn.Linear(args.action_dimension,
                                      args.action_embedding_dimension)
    # Multi-head attention
    observation_action_embedding_dim = (args.observation_embedding_dimension
                                        + args.action_embedding_dimension)
    self.q_projection = nn.Linear(observation_action_embedding_dim,
                                  observation_action_embedding_dim)
    self.v_projection = nn.Linear(observation_action_embedding_dim,
                                  observation_action_embedding_dim)
    self.k_projection = nn.Linear(observation_action_embedding_dim,
                                  observation_action_embedding_dim)
    self.attention = nn.MultiheadAttention(
        observation_action_embedding_dim, args.n_heads)
    self.predict = nn.Linear(observation_action_embedding_dim,
                             args.observation_dimension)
def __init__(
    self,
    d_model,
    nhead,
    dim_feedforward=2048,
    dropout=0.1,
    activation="relu",
    normalize_before=False,
):
    super().__init__()
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.dropout = nn.Dropout(dropout)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.activation = _get_activation_fn(activation)
    assert not normalize_before, "normalize_before is not supported"
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, return_atten_map=False): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self.return_atten_map = return_atten_map
def __init__(self, input_size, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
             normalize_before=False):
    super().__init__()
    self.self_attn = nn.MultiheadAttention(input_size, nhead, dropout=dropout)
    # Implementation of Feedforward model
    self.linear1 = nn.Linear(input_size, dim_feedforward)
    self.dropout = nn.Dropout(dropout)
    self.linear2 = nn.Linear(dim_feedforward, input_size)
    self.norm1 = nn.LayerNorm(input_size)
    self.norm2 = nn.LayerNorm(input_size)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.activation = _get_activation_fn(activation)
    self.normalize_before = normalize_before
def __init__(
    self,
    nhead,
    d_model,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    kdim=None,
    vdim=None,
):
    super().__init__()
    self.att = nn.MultiheadAttention(
        embed_dim=d_model,
        num_heads=nhead,
        dropout=dropout,
        bias=bias,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
        kdim=kdim,
        vdim=vdim,
    )
def __init__(self, d_model=256, d_ffn=1024, dropout=0.1, activation="relu",
             n_levels=4, n_heads=8, n_points=4):
    super().__init__()
    # cross attention
    self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
    self.dropout1 = nn.Dropout(dropout)
    self.norm1 = nn.LayerNorm(d_model)
    # self attention
    self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.norm2 = nn.LayerNorm(d_model)
    # ffn
    self.linear1 = nn.Linear(d_model, d_ffn)
    self.activation = _get_activation_fn(activation)
    self.dropout3 = nn.Dropout(dropout)
    self.linear2 = nn.Linear(d_ffn, d_model)
    self.dropout4 = nn.Dropout(dropout)
    self.norm3 = nn.LayerNorm(d_model)
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"): super().__init__() self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.dropout3 = nn.Dropout(dropout) self.activation = _get_activation_fn(activation)
def __init__(self, config, dataset):
    super(AutoInt, self).__init__(config, dataset)

    # load parameters info
    self.attention_size = config['attention_size']
    self.dropout_probs = config['dropout_probs']
    self.n_layers = config['n_layers']
    self.num_heads = config['num_heads']
    self.mlp_hidden_size = config['mlp_hidden_size']
    self.has_residual = config['has_residual']

    # define layers and loss
    self.att_embedding = nn.Linear(self.embedding_size, self.attention_size)
    self.embed_output_dim = self.num_feature_field * self.embedding_size
    self.atten_output_dim = self.num_feature_field * self.attention_size
    size_list = [self.embed_output_dim] + self.mlp_hidden_size
    self.mlp_layers = MLPLayers(size_list, dropout=self.dropout_probs[1])
    # multi-head self-attention network
    self.self_attns = nn.ModuleList([
        nn.MultiheadAttention(self.attention_size, self.num_heads,
                              dropout=self.dropout_probs[0])
        for _ in range(self.n_layers)
    ])
    self.attn_fc = torch.nn.Linear(self.atten_output_dim, 1)
    self.deep_predict_layer = nn.Linear(self.mlp_hidden_size[-1], 1)
    if self.has_residual:
        self.v_res_res_embedding = torch.nn.Linear(self.embedding_size, self.attention_size)
    self.dropout_layer = nn.Dropout(p=self.dropout_probs[2])
    self.sigmoid = nn.Sigmoid()
    self.loss = nn.BCELoss()

    # parameters initialization
    self.apply(self._init_weights)
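# A hedged sketch (not the library's actual forward) of how such a stack of
# nn.MultiheadAttention layers is typically applied over per-field feature embeddings:
# the field axis plays the role of the sequence axis, so a (batch, fields, dim) tensor is
# transposed to (fields, batch, dim) before each attention call. Sizes are illustrative.
import torch
import torch.nn as nn

batch, num_fields, attention_size, num_heads, n_layers = 32, 10, 16, 2, 3
self_attns = nn.ModuleList([
    nn.MultiheadAttention(attention_size, num_heads, dropout=0.1)
    for _ in range(n_layers)
])

x = torch.randn(batch, num_fields, attention_size).transpose(0, 1)  # (fields, batch, dim)
for attn in self_attns:
    x, _ = attn(x, x, x)                   # each field attends over all the other fields
x = x.transpose(0, 1).reshape(batch, -1)   # flatten to (batch, fields * dim) for the final linear
print(x.shape)                             # torch.Size([32, 160])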