def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
    super(DecoderLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
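# A minimal sketch of the forward pass that pairs with the layer built above. Assumption
# (not shown in the original): MultiHeadAttention is called as attn(q, k, v, mask=...) and
# returns (output, attention_weights), as in common PyTorch Transformer implementations.
def forward(self, dec_input, enc_output, slf_attn_mask=None, dec_enc_attn_mask=None):
    # masked self-attention over the decoder inputs
    dec_output, dec_slf_attn = self.slf_attn(dec_input, dec_input, dec_input, mask=slf_attn_mask)
    # encoder-decoder (cross) attention against the encoder outputs
    dec_output, dec_enc_attn = self.enc_attn(dec_output, enc_output, enc_output, mask=dec_enc_attn_mask)
    # position-wise feed-forward network
    dec_output = self.pos_ffn(dec_output)
    return dec_output, dec_slf_attn, dec_enc_attn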
def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()
    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
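# For comparison, a minimal sketch using the stock tf.keras.layers.MultiHeadAttention
# (available since TF 2.4) instead of the hand-rolled layer above. The shapes and
# hyper-parameter values below are illustrative assumptions, not the original project's.
import tensorflow as tf

d_model, num_heads = 512, 8
self_attn = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
cross_attn = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)

x = tf.random.normal([2, 10, d_model])        # (batch, target_len, d_model) decoder input
enc_out = tf.random.normal([2, 12, d_model])  # (batch, source_len, d_model) encoder output
y = self_attn(query=x, value=x, key=x)        # self-attention (pass attention_mask for look-ahead masking)
y = cross_attn(query=y, value=enc_out, key=enc_out)  # encoder-decoder attention -> (2, 10, d_model)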
def __init__(self, d_input, d_inner, n_head, d_K, d_V, dropout_rate=0.1):
    super(DecoderLayer, self).__init__()
    self.input_mh_att = MultiHeadAttention(n_head, d_input, d_K, d_V, dropout_rate=dropout_rate)
    self.enc_mh_att = MultiHeadAttention(n_head, d_input, d_K, d_V, dropout_rate=dropout_rate)
    self.pos_nn = PositionWiseFeedForward(d_input, d_inner, dropout_rate=dropout_rate)
def __init__(self):
    super(VisitNet_v2, self).__init__()
    self.gru_1 = nn.GRU(128, 128, bidirectional=True, batch_first=True, dropout=0.2)
    self.gru_2 = nn.GRU(256, 256, bidirectional=True, batch_first=True, dropout=0.2)
    self.slf_attn = MultiHeadAttention(8, 512, 64, 64, dropout=0.1)
    self.convs = nn.ModuleList([
        nn.Sequential(
            nn.Conv1d(24, 32, kernel_size=3, stride=1, padding=1, bias=False),
            nn.ReLU(),
        )
        for h in [1, 3, 5, 7]
    ])
    self.linear = nn.Linear(512, 256)
def __init__(self):
    super().__init__()
    self.bert = BertModel.from_pretrained('bert_base/')
    if args.bert_freeze:
        for param in self.bert.parameters():
            param.requires_grad = False
    self.lstm = BiLSTM(
        input_size=args.bert_hidden_size + args.cnn_output_size,
        hidden_size=args.rnn_hidden_size + args.cnn_output_size,
        num_layers=args.rnn_num_layers,
        num_dirs=args.rnn_num_dirs)
    self.lstm_dropout = nn.Dropout(p=args.rnn_dropout)
    self.cnn = CharCNN(
        embedding_num=len(CHAR_VOCAB),
        embedding_dim=args.cnn_embedding_dim,
        filters=eval(args.cnn_filters),
        output_size=args.cnn_output_size)
    self.crf = CRF(target_size=len(VOCAB) + 2, use_cuda=args.crf_use_cuda)
    self.linear = nn.Linear(
        in_features=args.rnn_hidden_size + args.cnn_output_size,
        out_features=len(VOCAB) + 2)
    self.attn = MultiHeadAttention(
        model_dim=args.rnn_hidden_size + args.cnn_output_size,
        num_heads=args.attn_num_heads,
        dropout=args.attn_dropout)
    self.feat_dropout = nn.Dropout(p=args.feat_dropout)
def __init__(self, embedding_space, qkv_dim, heads, sequence_length):
    super(Encoder, self).__init__()  # was `super(Encoder).__init__()`, which skips nn.Module's init
    self.multi_attention = MultiHeadAttention(
        embedding_space=embedding_space, heads=heads, qkv_dim=qkv_dim)
    self.residual = ResidualNorm()
    # nn.Linear takes integer in/out feature sizes and is applied over the last
    # dimension, so sequence_length does not belong in its constructor.
    self.linear = nn.Linear(qkv_dim, qkv_dim)
    self.relu = nn.ReLU()
def __init__(self, n_head, d_model, d_q, d_k, d_v, d_affine, dropout=0.1, fc_dropout=0.5):
    super().__init__()  # was `super.__init__()`, which raises at runtime
    self.selfAttn = MultiHeadAttention(n_head, d_model, d_q, d_k, d_v, dropout)
    self.feedForward = PointWiseFeedForward(d_model, d_affine, fc_dropout)
def __init__(self, n_head, d_input, d_model, globalnode, n_graph=1):
    super(GNN_Att_Layer, self).__init__()
    self.linear = nn.Linear(d_input, d_model)
    self.n_head = n_head
    self.attention = MultiHeadAttention(n_head=n_head, d_input=d_input, d_model=d_model)
    self.globalnode = globalnode
    if globalnode:
        self.g_node = GlobalNode(d_input, d_model)
def __init__(self, d_model, num_heads, dff, name, rate=0.1):
    super(EncoderLayer, self).__init__(name=name)
    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)
    # LayerNormalization lives directly under tf.keras.layers in TF 2.x (no `experimental` namespace)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6, name=name + '_LN1')
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6, name=name + '_LN2')
    self.dropout1 = tf.keras.layers.Dropout(rate, name=name + '_dp1')
    self.dropout2 = tf.keras.layers.Dropout(rate, name=name + '_dp2')
def __init__(
        self,
        d_model: int = 512,       # dimension of model
        num_heads: int = 8,       # number of attention heads
        d_ff: int = 2048,         # dimension of feed forward network
        dropout_p: float = 0.3,   # probability of dropout
        ffnet_style: str = 'ff'   # style of feed forward network
) -> None:
    super(SpeechTransformerEncoderLayer, self).__init__()
    self.self_attention = AddNorm(MultiHeadAttention(d_model, num_heads), d_model)
    self.feed_forward = AddNorm(PositionWiseFeedForwardNet(d_model, d_ff, dropout_p, ffnet_style), d_model)
def decoder_layer(units, d_model, d_enc_outputs, num_heads, dropout, name="decoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    enc_outputs = tf.keras.Input(shape=(None, d_enc_outputs), name="encoder_outputs")
    look_ahead_mask = tf.keras.Input(shape=(1, None, None), name="look_ahead_mask")

    attention1 = MultiHeadAttention(d_model, num_heads, name="attention_1")(inputs={
        "query": inputs, "key": inputs, "value": inputs, "mask": look_ahead_mask})
    attention1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention1 + inputs)

    attention2 = MultiHeadAttention(d_model, num_heads, name="attention_2")(inputs={
        "query": attention1, "key": enc_outputs, "value": enc_outputs})
    attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
    attention2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention2 + attention1)

    outputs = tf.keras.layers.Dense(units=units, activation="relu")(attention2)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(outputs + attention2)

    return tf.keras.Model(inputs=[inputs, enc_outputs, look_ahead_mask], outputs=outputs, name=name)
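# A hedged usage sketch for the functional decoder_layer above; the shapes, hyper-parameter
# values and the convention of the look-ahead mask are assumptions for illustration only.
sample_layer = decoder_layer(units=2048, d_model=512, d_enc_outputs=512, num_heads=8, dropout=0.1)
dec_in = tf.random.normal([64, 40, 512])     # (batch, target_len, d_model)
enc_out = tf.random.normal([64, 50, 512])    # (batch, source_len, d_enc_outputs)
mask = tf.zeros([64, 1, 40, 40])             # broadcastable look-ahead mask; semantics depend on MultiHeadAttention
out = sample_layer([dec_in, enc_out, mask])  # -> (64, 40, 512)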
def __init__(self, d_model, d_keys, d_values, n_heads, d_ff, dropout=0.1):
    super().__init__()
    self.masked_attention_head = MultiHeadAttention(d_model, d_keys, d_values, n_heads, dropout=dropout)
    self.attention_head = MultiHeadAttention(d_model, d_keys, d_values, n_heads, dropout=dropout)
    self.feed_forward = nn.Sequential(
        nn.Linear(d_model, d_ff),
        nn.ReLU(),
        nn.Linear(d_ff, d_model),
    )
    self.layer_norm1 = nn.LayerNorm(d_model)
    self.layer_norm2 = nn.LayerNorm(d_model)
    self.layer_norm3 = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)
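# A minimal forward sketch matching the modules registered above, using post-norm residual
# wiring. Assumption: the project's MultiHeadAttention is called as attn(q, k, v, mask=...)
# and returns the attended tensor; the actual signature may differ.
def forward(self, x, enc_out, tgt_mask=None, src_mask=None):
    # masked self-attention + residual + norm
    x = self.layer_norm1(x + self.dropout(self.masked_attention_head(x, x, x, mask=tgt_mask)))
    # encoder-decoder attention + residual + norm
    x = self.layer_norm2(x + self.dropout(self.attention_head(x, enc_out, enc_out, mask=src_mask)))
    # position-wise feed-forward + residual + norm
    return self.layer_norm3(x + self.dropout(self.feed_forward(x)))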
def __init__(self, hidden_dim, num_heads, inner_dim, dropout):
    super(EncoderLayer, self).__init__()
    self.self_attention = MultiHeadAttention(hidden_dim, num_heads, dropout)
    self.feedforward = PositionwiseFeedforward(hidden_dim, inner_dim, dropout)
    self.attention_norm = nn.LayerNorm(hidden_dim)
    self.feedforward_norm = nn.LayerNorm(hidden_dim)
    self.dropout = nn.Dropout(dropout)
def __init__(self, d_model, dropout=0.1):
    super(TransformerBlock, self).__init__()
    self.norm_1 = nn.LayerNorm(d_model)
    self.norm_2 = nn.LayerNorm(d_model)
    self.attn = MultiHeadAttention(d_model, 3)
    self.ff = FeedForward(d_model)
    self.dropout_1 = nn.Dropout(dropout)
    self.dropout_2 = nn.Dropout(dropout)
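# A hedged sketch of the pre-norm forward pass these modules suggest (norm before each
# sub-layer, residual added after). The attention call signature (q, k, v, mask) is an assumption.
def forward(self, x, mask=None):
    x2 = self.norm_1(x)
    x = x + self.dropout_1(self.attn(x2, x2, x2, mask))  # pre-norm self-attention + residual
    x = x + self.dropout_2(self.ff(self.norm_2(x)))      # pre-norm feed-forward + residual
    return x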
def __init__(self, d_input, d_inner, n_head, d_K, d_V, dropout_rate=0.1):
    super(EncoderLayer, self).__init__()
    # multi-head attention
    self.mh_att = MultiHeadAttention(n_head, d_input, d_K, d_V, dropout_rate=dropout_rate)
    # position-wise feed-forward nets
    self.pos_nn = PositionWiseFeedForward(d_input, d_inner, dropout_rate=dropout_rate)
def __init__(self, num_heads=8, input_dim=256, attn_dim=64, hidden_dim=2048):
    super(Encoder, self).__init__()
    self.attention = MultiHeadAttention(num_heads, input_dim, hidden_dim, input_dim)
    self.layer_norm1 = nn.LayerNorm(input_dim)
    self.fc1 = nn.Linear(input_dim, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, input_dim)
    self.layer_norm2 = nn.LayerNorm(input_dim)
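# A hedged sketch of the usual post-norm encoder wiring for these modules. Assumption:
# MultiHeadAttention here is called as attention(q, k, v) and returns a
# (batch, seq_len, input_dim) tensor.
import torch.nn.functional as F

def forward(self, x):
    x = self.layer_norm1(x + self.attention(x, x, x))  # self-attention + residual + norm
    ff = self.fc2(F.relu(self.fc1(x)))                 # position-wise feed-forward
    return self.layer_norm2(x + ff)                    # residual + norm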
def __init__(self, config):
    super(AttentionConvolution, self).__init__()
    self.config = config
    self.relative_pos_embder = RelativePosEmbder(stadia=self.config._stadia)
    self.directed_attention = MultiHeadAttention(
        query_dim=self.config._hid_dim,
        key_dim=self.config._hid_dim,
        num_units=self.config._hid_dim,
        dropout_p=self.config._dropout,
        h=self.config._num_heads)
    self.reversed_attention = MultiHeadAttention(
        query_dim=self.config._hid_dim,
        key_dim=self.config._hid_dim,
        num_units=self.config._hid_dim,
        dropout_p=self.config._dropout,
        h=self.config._num_heads)
    self.direct_fc = nn.Linear(3 * self.config._hid_dim, self.config._hid_dim)
    self.conv_acti = get_acti_fun(self.config._activation)
    self.conv_norm = nn.LayerNorm(self.config._hid_dim)
def __init__(self, chanIn, chanOut, heads, skip):
    super().__init__()
    self.bn_relu = bn_relu(chanIn)
    self.deconv1 = nn.ConvTranspose2d(chanIn, chanOut, 3, 2, padding=1, output_padding=1)
    self.att = MultiHeadAttention(chanIn, chanOut, chanOut, chanOut, heads, layer_type='UP')
    self.bn_relu2 = bn_relu(chanOut)
    self.res = res_(chanOut, chanOut)
def __init__(self, d_model, num_heads, dff, rate=0.1, fast_attn=False, rand_feat=100):
    super(EncoderLayer, self).__init__()
    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
def __init__(self, d_model: int, num_heads: int, max_len: int):
    super(Encoder, self).__init__()
    self.d_model = d_model
    self.max_len = max_len
    # Layers for Query, Key, Value matrices
    self.w_query = nn.Linear(in_features=d_model, out_features=d_model)
    self.w_key = nn.Linear(in_features=d_model, out_features=d_model)
    self.w_value = nn.Linear(in_features=d_model, out_features=d_model)
    # multi-head attention layer
    self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads, mask=False)
    self.attn_layer_norm = nn.LayerNorm(normalized_shape=d_model)
    self.linear = nn.Linear(in_features=d_model, out_features=d_model)
    self.linear_layer_norm = nn.LayerNorm(normalized_shape=d_model)
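# A minimal sketch of the forward pass these layers suggest: project the input into queries,
# keys and values, attend, then apply the two residual-plus-norm steps. Assumption: the
# project's MultiHeadAttention is called as attention(query, key, value).
def forward(self, x):
    q, k, v = self.w_query(x), self.w_key(x), self.w_value(x)
    x = self.attn_layer_norm(x + self.attention(q, k, v))  # attention + residual + norm
    return self.linear_layer_norm(x + self.linear(x))      # linear + residual + norm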
def __init__(self, input_size):
    super().__init__()
    self.chn = input_size
    self.input_conv1 = nn.Conv2d(1, self.chn, 3, 1, 1)
    self.block1_res1 = res_(self.chn, self.chn)
    self.block1 = bn_relu(self.chn, self.chn, ks=3, stride=1, padding=1)
    self.block2_res2 = res_(self.chn, self.chn * 2, ks=1, stride=2)
    self.block2 = bn_relu(self.chn, self.chn * 2, ks=3, stride=2)
    self.block3_res3 = res_(self.chn * 2, self.chn * 4, ks=1, stride=2)
    self.block3 = bn_relu(self.chn * 2, self.chn * 4, ks=3, stride=2)
    # self.block2 = bn_relu(self.chn, self.chn*2, 3, 2)  # down
    # self.block2_res = nn.Conv2d(self.chn, self.chn*2, 1, 2, padding=0)
    # self.block3 = bn_relu(self.chn*2, self.chn*4, 3, 2)
    # self.block3_res = nn.Conv2d(self.chn*2, self.chn*4, 1, 2, padding=0)
    self.bn = nn.BatchNorm2d(self.chn * 4, momentum=.997)
    self.mid = MultiHeadAttention(self.chn * 4, self.chn * 4, 32, 32, 8)
    self.up2 = nn.ConvTranspose2d(self.chn * 4, self.chn * 2, 3, 2, padding=1, output_padding=1)
    self.up2_1 = bn_relu(self.chn * 2, self.chn * 2, 3, 1, 1)
    self.up1 = nn.ConvTranspose2d(self.chn * 2, self.chn, 3, 2, padding=1, output_padding=1)
    self.up1_1 = bn_relu(self.chn, self.chn, 3, 1, 1)
    # self.up1m = MultiHeadAttention_(32, 32, 12, 12, 4)
    # self.bn3 = nn.BatchNorm2d(32)
    self.out_bn = nn.BatchNorm2d(self.chn, momentum=.997)
    self.drop = nn.Dropout2d(.5)
    self.out = bn_relu(self.chn, self.chn, 3, 1, 1)
    self.out_ = nn.Conv2d(self.chn, 2, 1, 1)
def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1):
    """
    :param d_model: int, dimension of the hidden layers
    :param d_inner_hid: int, dimension of the second (inner) layer of the position-wise feed-forward network
    :param n_head: int, number of attention heads
    :param d_k: int, dimension of the key vectors
    :param d_v: int, dimension of the value vectors
    :param dropout: float, dropout rate
    """
    super(EncoderLayer, self).__init__()
    # self-attention inside the encoder
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    # position-wise FFN
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout)
def __init__(self, hparams):
    super(Tacotron2, self).__init__()
    self.train_type = 'gst'
    self.mask_padding = hparams.mask_padding
    self.fp16_run = hparams.fp16_run
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.embedding = nn.Embedding(hparams.n_symbols, hparams.symbols_embedding_dim)
    torch.nn.init.xavier_uniform_(self.embedding.weight.data)
    # integer division: key_dim is used as a tensor dimension below
    key_dim = 256 // hparams.num_heads
    self.reference_encoder = ReferenceEncoder(
        input_dim=hparams.linear_dim, filters=[32, 32, 64, 64, 128, 128])
    self.style_attention = MultiHeadAttention(query_dim=128, key_dim=key_dim, num_units=128)
    self.gst_token = nn.Parameter(torch.randn(hparams.style_token, key_dim))
    self.encoder = Encoder(hparams)
    self.decoder = Decoder(hparams)
    self.postnet = Postnet(hparams)
def __init__(self, d_model, n_head=8, d_ff_filter=2048, att_dropout=0.3, residual_dropout=0.0,
             relu_dropout=0.0, encoder_normalize_before=False):
    super(SelfAttEncoderLayer, self).__init__()
    self.layer_norm_0 = nn.LayerNorm(d_model, elementwise_affine=True)
    self.self_attn = MultiHeadAttention(d_model, n_head, dropout_prob=att_dropout)
    self.residual_dropout_prob = residual_dropout
    self.layer_norm_1 = nn.LayerNorm(d_model, elementwise_affine=True)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_ff_filter, d_model, dropout_prob=relu_dropout)
    self.encoder_normalize_before = encoder_normalize_before
def __init__(
        self,
        num_classes: int,                    # number of classes to classify
        max_length: int = 150,               # maximum allowed length of the sequence to be processed
        hidden_dim: int = 1024,              # dimension of the RNN's hidden state vector
        pad_id: int = 0,                     # pad token's id
        sos_id: int = 1,                     # start-of-sentence token's id
        eos_id: int = 2,                     # end-of-sentence token's id
        attn_mechanism: str = 'multi-head',  # type of attention mechanism
        num_heads: int = 4,                  # number of attention heads
        num_layers: int = 2,                 # number of RNN layers
        rnn_type: str = 'lstm',              # type of RNN cell
        dropout_p: float = 0.3,              # dropout probability
        device: str = 'cuda'                 # device - 'cuda' or 'cpu'
) -> None:
    super(Speller, self).__init__(hidden_dim, hidden_dim, num_layers, rnn_type, dropout_p, False, device)
    self.num_classes = num_classes
    self.num_heads = num_heads
    self.num_layers = num_layers
    self.max_length = max_length
    self.eos_id = eos_id
    self.sos_id = sos_id
    self.pad_id = pad_id
    self.attn_mechanism = attn_mechanism.lower()
    self.embedding = nn.Embedding(num_classes, hidden_dim)
    self.input_dropout = nn.Dropout(dropout_p)
    if self.attn_mechanism == 'loc':
        self.attention = AddNorm(LocationAwareAttention(hidden_dim, smoothing=True), hidden_dim)
    elif self.attn_mechanism == 'multi-head':
        self.attention = AddNorm(MultiHeadAttention(hidden_dim, num_heads), hidden_dim)
    elif self.attn_mechanism == 'additive':
        self.attention = AdditiveAttention(hidden_dim)
    elif self.attn_mechanism == 'scaled-dot':
        self.attention = AddNorm(ScaledDotProductAttention(hidden_dim), hidden_dim)
    else:
        # str.format needs a {} placeholder; the original mixed %-style with format-style
        raise ValueError("Unsupported attention: {0}".format(attn_mechanism))
    self.projection = AddNorm(Linear(hidden_dim, hidden_dim, bias=True), hidden_dim)
    self.generator = Linear(hidden_dim, num_classes, bias=False)
def __init__(self, n_input_feats, d_model, n_heads, d_ff, kernel_size, n_blocks, embedder,
             dropout=0., only_see_past=True, full_att=False, n_blocks_strided=None, residual=True, **kwargs):
    super().__init__()
    n_blocks_strided = n_blocks // 2 if n_blocks_strided is None else n_blocks_strided
    strides = [2 if i < n_blocks_strided else 1 for i in range(n_blocks)]
    self.residual = residual
    self.embedder = embedder
    self.input_proj = nn.Sequential(
        nn.Linear(n_input_feats, d_model),
        nn.ReLU(inplace=True),
        nn.LayerNorm(d_model))
    self.blocks = nn.ModuleList([
        ConvMultipleDilationBlock(
            d_model, n_heads, kernel_size, d_ff, dropout=dropout, only_see_past=only_see_past,
            self_attn=True if i % 2 == 0 or full_att else False, stride=s)
        for i, s in enumerate(strides)
    ])
    self.attn = MultiHeadAttention(d_model, d_model // n_heads, d_model // n_heads, n_heads, dropout=dropout)
def __init__(self, config):
    super(EncoderBlock, self).__init__()
    d_model = config['d_model']
    num_heads = config['num_attention_heads']
    num_fc_units = config['intermediate_size']
    self_attn_dropout_rate = config['attention_dropout_rate']
    fc_dropout_rate = config['dropout_rate']
    self.self_attention_layer = MultiHeadAttention(d_model, num_heads)
    self.ffn = tf.keras.Sequential([
        tf.keras.layers.Dense(num_fc_units, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])
    self.self_attn_layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.fc_layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.self_attn_dropout = tf.keras.layers.Dropout(self_attn_dropout_rate)
    self.fc_dropout = tf.keras.layers.Dropout(fc_dropout_rate)
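# A hedged sketch of the matching call() for this Keras encoder block (standard post-norm
# Transformer wiring). The (q, k, v, mask) signature of the custom MultiHeadAttention is assumed.
def call(self, x, training=False, mask=None):
    attn_out = self.self_attention_layer(x, x, x, mask)
    attn_out = self.self_attn_dropout(attn_out, training=training)
    x = self.self_attn_layernorm(x + attn_out)       # residual + norm around self-attention
    ffn_out = self.fc_dropout(self.ffn(x), training=training)
    return self.fc_layernorm(x + ffn_out)            # residual + norm around the feed-forward net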
def __init__(self, d_model, n_heads, kernel_size, d_ff, dropout=0., only_see_past=True, self_attn=True, **kwargs):
    super().__init__()
    self.kernel_size = kernel_size
    self.only_see_past = only_see_past
    self.self_attn = self_attn
    padding = 0 if only_see_past else (kernel_size - 1) // 2
    self.conv = nn.Conv1d(in_channels=d_model, out_channels=2 * d_model,
                          kernel_size=kernel_size, padding=padding)
    if self.self_attn:
        d_keys_vals = d_model // n_heads
        self.attn = MultiHeadAttention(d_model, d_keys_vals, d_keys_vals, n_heads, dropout=dropout)
        self.attn_norm = nn.LayerNorm(d_model)
    self.feed_forward = nn.Sequential(
        nn.Dropout(dropout),
        nn.Linear(d_model, d_ff),
        nn.ReLU(inplace=True),
        nn.Dropout(dropout),
        nn.Linear(d_ff, d_model),
        nn.ReLU(inplace=True),
        nn.LayerNorm(d_model))
def __init__(self, ninp, nhid, num_blocks_in, num_blocks_out, topkval, step_att, do_gru,
             num_modules_read_input=2, device=None):
    super(BlocksCore, self).__init__()
    self.nhid = nhid
    self.num_blocks_in = num_blocks_in
    self.num_blocks_out = num_blocks_out
    self.block_size_in = nhid // num_blocks_in
    self.block_size_out = nhid // num_blocks_out
    self.ninp = ninp
    self.topkval = topkval
    self.step_att = step_att
    self.do_gru = do_gru
    self.num_modules_read_input = num_modules_read_input
    # print("~~~ Blocks Core Initialize ~~~")
    # print("nhid: ", nhid)
    # print("num_blocks_in: ", num_blocks_in)
    # print("num_blocks_out: ", num_blocks_out)
    # print("block_size_in: ", self.block_size_in)
    # print("block_size_out: ", self.block_size_out)
    # print("topkval: ", topkval)
    # input()
    self.mha = MultiHeadAttention(
        n_head=4,
        d_model_read=self.block_size_out,
        d_model_write=self.block_size_out,
        d_model_out=self.block_size_out,
        d_k=16, d_v=16,
        num_blocks_read=self.num_blocks_out,
        num_blocks_write=self.num_blocks_out,
        topk=self.num_blocks_out,
        grad_sparse=False)
    self.att_out = self.block_size_out * 4
    self.inp_att = MultiHeadAttention(
        n_head=1,
        d_model_read=self.block_size_out,
        d_model_write=ninp,
        d_model_out=self.att_out,
        d_k=64, d_v=self.att_out,
        num_blocks_read=num_blocks_out,
        num_blocks_write=num_modules_read_input,
        residual=False,
        topk=self.num_blocks_in + 1,
        grad_sparse=False,
        skip_write=True)
    if do_gru:
        self.block_lstm = BlockGRU(self.att_out * self.num_blocks_out, self.nhid, k=self.num_blocks_out)
    else:
        self.block_lstm = BlockLSTM(self.att_out * self.num_blocks_out, self.nhid, k=self.num_blocks_out)
    self.device = device
def __init__(self, char2idx, bichar2idx, seg2idx, pos2idx, ner2idx, lexicon2idx, label2idx, longest_text_len):
    super(Luban7, self).__init__()
    self.char2idx = char2idx
    self.bichar2idx = bichar2idx
    self.seg2idx = seg2idx
    self.label2idx = label2idx
    self.pos2idx = pos2idx
    self.lexicon2idx = lexicon2idx

    """ Embedding Layer """
    self.embeds = MixEmbedding(char_vocab_size=len(char2idx), char_emb_size=config.char_emb_size,
                               char_dropout=config.drop_char,
                               seg_vocab_size=len(seg2idx), seg_emb_size=config.seg_emb_size,
                               seg_dropout=config.drop_segpos,
                               bichar_vocab_size=len(bichar2idx), bichar_emb_size=config.bichar_emb_size,
                               pos_vocab_size=len(pos2idx), pos_emb_size=config.pos_emb_size,
                               pos_dropout=config.drop_segpos,
                               sparse=config.use_sparse_embed == "on")
    if config.char_emb_size > 0 and config.char_emb_pretrain != 'off':
        load_word2vec(embedding=self.embeds.char_embeds,
                      word2vec_path=config.char_emb_pretrain,
                      norm=True,
                      word_dict=self.char2idx,
                      cached_name="char" if config.load_from_cache == "on" else None)
    if config.bichar_emb_size > 0 and config.bichar_emb_pretrain != 'off':
        load_word2vec(embedding=self.embeds.bichar_embeds,
                      word2vec_path=config.bichar_emb_pretrain,
                      norm=True,
                      word_dict=self.bichar2idx,
                      cached_name="bichar" if config.load_from_cache == "on" else None)
    self.embeds.show_mean_std()
    embed_dim = self.embeds.embedding_dim

    if config.token_type == "tfer":
        self.token_encoder = TransformerEncoderV2(
            d_model=embed_dim,
            len_max_seq=config.max_sentence_length,
            n_layers=config.tfer_num_layer,
            n_head=config.tfer_num_head,
            d_head=config.tfer_head_dim,
            dropout=config.drop_token_encoder)
        token_dim = embed_dim
    elif config.token_type == "rnn":
        self.token_encoder = BiRNNTokenEncoder(
            cell_type='lstm',
            num_layers=config.rnn_num_layer,
            input_size=embed_dim,
            hidden_size=config.rnn_hidden,
            dropout=config.drop_token_encoder)
        token_dim = config.rnn_hidden
    elif config.token_type == 'plain':
        token_dim = embed_dim
    else:
        raise Exception

    self.ner_score = torch.nn.Sequential(
        torch.nn.Linear(token_dim, token_dim * 2),
        torch.nn.ReLU(),
        torch.nn.Linear(token_dim * 2, len(ner2idx)),
    )
    self.ner_crf = CRF(len(ner2idx), batch_first=True)

    """ Fragment & Context Layer """
    frag_dim = 0
    if config.frag_type == "rnn":
        self.fragment_encoder = FragmentEnumerator(
            max_span_len=config.max_span_length,
            encoder_cls=RNNSeqEncoder,
            encoder_args=('lstm', token_dim, token_dim),
            fusion=config.frag_fusion)
    elif config.frag_type == "fofe":
        self.fragment_encoder = FragmentEnumerator(
            max_span_len=config.max_span_length,
            encoder_cls=FofeSeqEncoder,
            encoder_args=(config.frag_fofe_alpha,),
            fusion=config.frag_fusion)
    elif config.frag_type == "average":
        self.fragment_encoder = FragmentEnumerator(
            max_span_len=config.max_span_length,
            encoder_cls=AverageSeqEncoder,
            encoder_args=(),
            fusion=config.frag_fusion)
    elif config.frag_type == "off":
        pass
    else:
        raise Exception

    if config.frag_use_sos == "on":
        self.sos_token = torch.nn.Parameter(torch.Tensor(token_dim))
        self.eos_token = torch.nn.Parameter(torch.Tensor(token_dim))
        std = 1. / math.sqrt(token_dim)
        self.sos_token.data.uniform_(-std, std)
        self.eos_token.data.uniform_(-std, std)
    else:
        self.sos_token = None
        self.eos_token = None

    if config.frag_type != "off":
        if config.frag_fusion == 'cat':
            frag_dim += 2 * token_dim
        elif config.frag_fusion == 'add':
            frag_dim += token_dim
        else:
            raise Exception

    if config.frag_att_type != "off":
        self.multi_att = MultiHeadAttention(
            d_q=frag_dim, d_k=token_dim, d_v=token_dim, d_out=frag_dim,
            d_att_k=frag_dim // config.frag_att_head,
            d_att_v=frag_dim // config.frag_att_head,
            n_head=config.frag_att_head,
            dropout=config.drop_default)
        self.att_norm = torch.nn.LayerNorm(frag_dim)
        frag_dim += {"cat": frag_dim, "add": 0}[config.frag_att_type]

    if config.ctx_type in ['include', 'exclude']:
        self.context_encoder = ContextEnumerator(
            max_span_len=config.max_span_length,
            encoder_cls=RNNSeqEncoder,
            encoder_args=('lstm', token_dim, token_dim),
            out_size=token_dim,
            include=config.ctx_type == 'include')
        frag_dim += token_dim + token_dim

    """ Non Linear Stack """
    self.non_linear_stack = torch.nn.ModuleList([
        NonLinearLayerWithRes(frag_dim, 2 * frag_dim, dropout=config.drop_nonlinear)
        for _ in range(config.num_nonlinear)
    ])

    """ Lexicon Embedding """
    if config.lexicon_emb_pretrain != "off":
        lexicon_emb_name, lexicon_emb_dim = gen_word2vec_name_dim(config.lexicon_emb_pretrain)
        self.lexicon_embeds = torch.nn.Embedding(len(lexicon2idx), lexicon_emb_dim,
                                                 sparse=config.use_sparse_embed == "on")
        load_word2vec(self.lexicon_embeds, lexicon2idx, config.lexicon_emb_pretrain, norm=True,
                      cached_name="lexicon".format(lexicon_emb_name) if config.load_from_cache == "on" else None)
        if config.match_mode in ["naive", "mix", "middle"]:
            if config.match_mode == "naive":
                match_vocab_size = len(match2idx_naive)
            elif config.match_mode == "mix":
                match_vocab_size = len(match2idx_mix)
            elif config.match_mode == "middle":
                match_vocab_size = len(match2idx_middle)
            else:
                raise Exception
            self.match_embeds = torch.nn.Embedding(match_vocab_size, config.match_emb_size,
                                                   sparse=config.use_sparse_embed == "on")
            if config.match_head > 0:
                self.lexicon_attention = MultiHeadAttention(
                    d_q=frag_dim,
                    d_k=lexicon_emb_dim + config.match_emb_size,
                    d_v=lexicon_emb_dim + config.match_emb_size,
                    d_att_k=frag_dim // config.match_head,
                    d_att_v=frag_dim // config.match_head,
                    n_head=config.match_head,
                    dropout=config.drop_default,
                    d_out=frag_dim)
                frag_dim = frag_dim * 2
            elif config.match_head == 0:
                self.lexicon_attention = VanillaAttention(
                    query_size=frag_dim,
                    mem_size=lexicon_emb_dim + config.match_emb_size,
                    dropout=config.drop_default)
                frag_dim += lexicon_emb_dim + config.match_emb_size
            else:
                raise Exception
        else:
            raise Exception

    self.scorer = torch.nn.Sequential(
        torch.nn.Linear(frag_dim, 2 * frag_dim),
        torch.nn.ReLU(),
        torch.nn.Linear(2 * frag_dim, len(label2idx))
    )