def __init__(self, config):
    super(Model, self).__init__()
    self.config = config
    self.use_elmo = config.use_elmo
    self.use_bert = config.use_bert

    # feature layer
    input_size = 0
    if config.use_elmo:
        input_size += (3 * 1024)
    if config.use_bert:
        input_size += 768
    self.query_linear = nn.Linear(input_size, config.encoding_size)
    self.query_lstm = nn.LSTM(input_size, config.encoding_size // 2, 2,
                              bidirectional=True, batch_first=True)
    self.nodes_linear = nn.Linear(input_size, config.encoding_size)

    # gcn layer
    self.nodes_dropout = nn.Dropout(self.config.dropout)
    # independent layers; [nn.Linear(512, 512)] * 4 would register one module four
    # times and share its parameters across all hops
    self.hidden_linears = nn.ModuleList([nn.Linear(512, 512) for _ in range(4)])
    self.combined_linear = nn.Linear(1024, 512)

    # bi_attention layer
    self.attention_linear = nn.Linear(512 * 3, 1, False)
    self.mha = attention.MultiHeadAttention(config.encoding_size, 8)

    # output layer
    self.out_att1 = nn.Linear(2048, 128)
    self.out_att2 = nn.Linear(128, 1)
def __init__(self, model_part, seq_len, d_model, d_inner, n_head, d_k, d_v,
             layer_sizes, learned, embedding_dim, activation, output_activation,
             transfer, dropout=0.1):
    super(Transformer, self).__init__()
    self.learned = learned
    if learned:
        self.embed = nn.Embedding(AMINO_ACID, LEARNED_DIM, padding_idx=0)
    self.posembed = PositionEmbedding(seq_len, d_model, "Sinusoid")
    self.d_model = d_model
    self.slf_attn = attention.MultiHeadAttention(
        n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
    self.model_part = model_part
    self.attn, output_size = aggregate_feature(model_part, d_model, seq_len)
    self.transfer = transfer
    if transfer:
        output_size = output_size * 2
    self.layer_num = len(layer_sizes)
    if self.layer_num > 0:
        self.denses = nn.ModuleList()
        self.dactive = nn.ModuleList()
        for i, layer_size in enumerate(layer_sizes):
            self.denses.append(nn.Linear(output_size, layer_size))
            output_size = layer_size
            self.dactive.append(get_activation(activation))
    self.output_layer = nn.Linear(output_size, embedding_dim)
    self.output_active = get_activation(output_activation)
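# The PositionEmbedding(seq_len, d_model, "Sinusoid") module used above is not shown here.
# Below is a minimal sketch of what a "Sinusoid" position embedding usually computes (the
# fixed encoding from "Attention Is All You Need"), assuming an even d_model and that the
# module adds a (seq_len, d_model) table to its input; class and argument names are
# hypothetical stand-ins, not the project's actual implementation.
import math

import torch
import torch.nn as nn


class SinusoidPositionEmbedding(nn.Module):
    def __init__(self, seq_len, d_model):
        super().__init__()
        position = torch.arange(seq_len).unsqueeze(1).float()            # (seq_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-math.log(10000.0) / d_model))           # (d_model / 2,)
        pe = torch.zeros(seq_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)                     # even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)                     # odd dimensions
        self.register_buffer("pe", pe)                                   # fixed, not learned

    def forward(self, x):
        # x: (batch, seq_len, d_model); add the positional table, truncated to the input length
        return x + self.pe[: x.size(1)].unsqueeze(0)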
def __init__(self, d_w, d_e, num_classes, hidden_dim, word_emb_weight,
             num_layers=4, num_heads=8, dropout=0.1, max_sen_len=100):
    super(Transformer, self).__init__()
    self.max_sen_len = max_sen_len
    self.w2v = nn.Embedding.from_pretrained(word_emb_weight, freeze=False)
    self.pos_embedding1 = nn.Embedding(2 * self.max_sen_len, d_e)
    self.pos_embedding2 = nn.Embedding(2 * self.max_sen_len, d_e)

    c = copy.deepcopy
    d_model = d_w + 2 * d_e
    self_attn = attention.MultiHeadAttention(h=num_heads, d_model=d_model,
                                             dropout=dropout)
    ff = layers.PositionwiseFeedForward(d_model=d_model, d_ff=hidden_dim,
                                        dropout=dropout)
    word_attn = attention.WordAttention(d_model)  # (batch, sen, d_model) => (batch, d_model)
    self.model = nn.Sequential(
        layers.Encoder(
            layers.EncoderLayer(d_model, c(self_attn), c(ff), dropout),
            num_layers),
        word_attn,
        nn.Linear(d_model, d_model // 2),
        nn.ReLU(),
        nn.Linear(d_model // 2, num_classes))

    for p in self.model.parameters():
        if p.dim() > 1:  # only weight matrices (more than one dimension), not biases
            nn.init.xavier_uniform_(p)
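# The forward pass of the model above is not shown. The two position-embedding tables and
# d_model = d_w + 2 * d_e suggest that each token's word embedding is concatenated with two
# position embeddings (e.g. offsets relative to two target spans, shifted to be non-negative).
# The sketch below only illustrates that concatenation under this assumption; all input
# names and sizes are hypothetical.
import torch
import torch.nn as nn

batch, sen_len, vocab, d_w, d_e, max_sen_len = 4, 20, 1000, 300, 50, 100

tokens = torch.randint(0, vocab, (batch, sen_len))            # word indices
pos1 = torch.randint(0, 2 * max_sen_len, (batch, sen_len))    # position index w.r.t. span 1
pos2 = torch.randint(0, 2 * max_sen_len, (batch, sen_len))    # position index w.r.t. span 2

w2v = nn.Embedding(vocab, d_w)
pos_embedding1 = nn.Embedding(2 * max_sen_len, d_e)
pos_embedding2 = nn.Embedding(2 * max_sen_len, d_e)

x = torch.cat([w2v(tokens), pos_embedding1(pos1), pos_embedding2(pos2)], dim=-1)
print(x.shape)  # torch.Size([4, 20, 400]) == (batch, sen_len, d_w + 2 * d_e)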
def _self_attention(self, x, attention_bias=None):
    with tf.variable_scope('self-attention'):
        multi_head_attention = attention.MultiHeadAttention(
            num_heads=self.num_heads,
            linear_key_dim=self.linear_key_dim,
            linear_value_dim=self.linear_value_dim,
            hidden_size=self.hidden_size,
            dropout=self.dropout,
            attention_bias=attention_bias)
        return multi_head_attention.build(x, x, x)
def __init__(self, d_w, d_e, num_heads, num_layers, hidden_dim, window_sizes,
             num_filter, dropout_p, is_gpu, num_classes=2):
    super(CharAttnModelHelper, self).__init__()
    self.w2v = nn.Embedding(97, d_w)
    self.pos_embedding = nn.Embedding(842, d_e)
    self.is_gpu = is_gpu

    c = copy.deepcopy
    d_model = d_w + d_e
    self.cnn_layer1 = nn.Sequential(
        nn.Conv2d(in_channels=1,
                  out_channels=d_model,
                  kernel_size=(3, d_model),
                  stride=(1, 1),
                  padding=(1, 0))  # (batch, d_model, max_sen_len, 1)
    )
    self.cnn_layer1.apply(self.weights_init)

    self_attn = attention.MultiHeadAttention(h=num_heads, d_model=d_model,
                                             dropout=dropout_p)
    ff = layers.PositionwiseFeedForward(d_model=d_model, d_ff=hidden_dim,
                                        dropout=dropout_p)
    self.self_attn_layer = nn.Sequential(
        layers.Encoder(
            layers.EncoderLayer(d_model, c(self_attn), c(ff), dropout_p),
            num_layers))  # (batch, max_sen_len, d_w + d_e)
    for p in self.self_attn_layer.parameters():
        if p.dim() > 1:  # only weight matrices (more than one dimension), not biases
            nn.init.xavier_uniform_(p)

    self.cnn_layer2 = CNNLayers(d_model, num_filter, window_sizes, dropout_p, is_gpu)

    # (batch, len(window_sizes), num_filter) => (batch, num_filter)
    self.word_attn = attention.WordAttention(num_filter)
    for p in self.word_attn.parameters():
        if p.dim() > 1:  # only weight matrices (more than one dimension), not biases
            nn.init.xavier_uniform_(p)

    self.linear_layer = nn.Sequential(
        nn.Linear(num_filter, num_filter // 2),
        nn.Dropout(dropout_p),
        nn.Tanh(),
        nn.Linear(num_filter // 2, num_classes))
    self.linear_layer.apply(self.weights_init)
def __init__(self, model_dim, n_head, key_dim, value_dim, hidden_dim, dropout=0.1):
    super(DecoderLayer, self).__init__()
    self.self_attention = attention.MultiHeadAttention(model_dim=model_dim,
                                                       n_head=n_head,
                                                       key_dim=key_dim,
                                                       value_dim=value_dim,
                                                       dropout=dropout)
    self.layer_norm_1 = nn.LayerNorm(normalized_shape=model_dim, eps=1e-12)
    self.encoder_attention = attention.MultiHeadAttention(model_dim=model_dim,
                                                          n_head=n_head,
                                                          key_dim=key_dim,
                                                          value_dim=value_dim,
                                                          dropout=dropout)
    self.layer_norm_2 = nn.LayerNorm(normalized_shape=model_dim, eps=1e-12)
    self.ffn = feed_forward.PositionwiseFeedForward(model_dim=model_dim,
                                                    hidden_dim=hidden_dim)
    self.layer_norm_3 = nn.LayerNorm(normalized_shape=model_dim, eps=1e-12)
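# The matching forward() is not shown. Below is a minimal sketch of how such a decoder
# layer is typically wired: masked self-attention, then attention over the encoder output,
# then the position-wise feed-forward block, each followed by a residual connection and its
# layer norm. torch.nn.MultiheadAttention is used as a stand-in because the call signature
# of the project's attention.MultiHeadAttention is not given; treat argument names and
# shapes as assumptions rather than the original implementation.
import torch.nn as nn


class DecoderLayerSketch(nn.Module):
    def __init__(self, model_dim, n_head, hidden_dim, dropout=0.1):
        super().__init__()
        self.self_attention = nn.MultiheadAttention(model_dim, n_head,
                                                    dropout=dropout, batch_first=True)
        self.layer_norm_1 = nn.LayerNorm(model_dim, eps=1e-12)
        self.encoder_attention = nn.MultiheadAttention(model_dim, n_head,
                                                       dropout=dropout, batch_first=True)
        self.layer_norm_2 = nn.LayerNorm(model_dim, eps=1e-12)
        self.ffn = nn.Sequential(nn.Linear(model_dim, hidden_dim), nn.ReLU(),
                                 nn.Linear(hidden_dim, model_dim))
        self.layer_norm_3 = nn.LayerNorm(model_dim, eps=1e-12)

    def forward(self, x, memory, tgt_mask=None):
        # x: (batch, tgt_len, model_dim), memory: (batch, src_len, model_dim)
        attn_out, _ = self.self_attention(x, x, x, attn_mask=tgt_mask)
        x = self.layer_norm_1(x + attn_out)                  # residual + norm 1
        attn_out, _ = self.encoder_attention(x, memory, memory)
        x = self.layer_norm_2(x + attn_out)                  # residual + norm 2
        return self.layer_norm_3(x + self.ffn(x))            # residual + norm 3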
def __init__(self, size, n_heads, dropout):
    super().__init__()
    self.attention = attention.MultiHeadAttention(size, n_heads)
    self.dropout = nn.Dropout(dropout)
    self.layer_norm = nn.LayerNorm(size)