def __init__(self, h, d_model, dropout=0.1):
    super(MultiHeadedAttention, self).__init__()
    assert d_model % h == 0
    # We assume d_v always equals d_k
    self.d_k = d_model // h
    self.h = h
    self.linears = clones(nn.Linear(d_model, d_model), 4)
    self.dropout = nn.Dropout(dropout)
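# The clones(...) helper used throughout these snippets is not shown here.
# A minimal sketch, assuming the common Annotated Transformer definition
# (deep-copied modules registered in an nn.ModuleList so their parameters
# are tracked independently):
import copy
import torch.nn as nn

def clones(module, N):
    """Produce N identical (independently parameterized) copies of a module."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])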
def __init__(self, size, attention, feed_forward, keep_prob):
    super(EncoderLayerGoogle, self).__init__()
    self.size = size
    self.attention = attention
    self.feed_forward = feed_forward
    # Each encoder layer has two sublayers
    self.sublayer = clones(ResidualConnectionGoogle(size, keep_prob), 2)
def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
    super(DecoderLayer, self).__init__()
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    # Each decoder layer has three sublayers: self-attention,
    # source attention, and feed-forward
    self.sublayers = clones(SublayerConnection(size, dropout), 3)
    self.size = size
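# A sketch of how DecoderLayer's three sublayers are typically wired in
# forward(), following the Annotated Transformer pattern. The argument names
# (memory, src_mask, tgt_mask) are assumptions, not taken from this repo.
def forward(self, x, memory, src_mask, tgt_mask):
    m = memory
    # 1) masked self-attention over the decoder input
    x = self.sublayers[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
    # 2) attention over the encoder output (memory)
    x = self.sublayers[1](x, lambda x: self.src_attn(x, m, m, src_mask))
    # 3) position-wise feed-forward network
    return self.sublayers[2](x, self.feed_forward)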
def __init__(self, heads=8, d_model=512, keep_prob=0.1):
    super(MultiHeadedAttentionGoogle, self).__init__()
    assert d_model % heads == 0
    self.d_k = d_model // heads
    self.heads = heads
    self.linears = clones(nn.Linear(d_model, d_model), 4)
    self.attn = None
    # Note: despite its name, keep_prob is passed to nn.Dropout,
    # which interprets it as the drop probability
    self.dropout = nn.Dropout(keep_prob)
def __init__(self, head_nums, d_model, dropout=0.1):
    super(MultiHeadAttention, self).__init__()
    # Ensure the model dimension d_model is divisible by the number of heads
    assert d_model % head_nums == 0
    self.d_k = d_model // head_nums
    self.head_nums = head_nums
    # Four linear layers: the first three project Q, K, and V; the last one
    # combines the concatenated heads at the end of multi-head attention
    self.linears = clones(nn.Linear(d_model, d_model), 4)
    self.attn = None
    self.dropout = nn.Dropout(p=dropout)
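# All of the multi-head modules above split d_model into heads of size d_k
# and rely on a scaled dot-product attention function in forward(). That
# function is not shown in these snippets; a minimal sketch, assuming the
# standard formulation softmax(QK^T / sqrt(d_k))V:
import math
import torch
import torch.nn.functional as F

def attention(query, key, value, mask=None, dropout=None):
    """query/key/value: (batch, heads, seq_len, d_k)."""
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # mask out disallowed positions before the softmax
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn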
def __init__(self, config):
    super(MultiHeadedAttentiveModule, self).__init__()
    assert "x_dim" in config and "y_dim" in config and "head_num" in config
    # Attention layer
    attention_config = deepcopy(config)
    attention_config["name"] = "Attention"
    self.attention = Attention(attention_config)
    self.input_dim = config["x_dim"]
    self.output_dim = config["y_dim"]
    self.head_num = config["head_num"]
    assert self.input_dim % self.head_num == 0
    self.sub_input_dim = self.input_dim // self.head_num
    self.input_linears = utils.clones(nn.Linear(self.input_dim, self.output_dim), 3)
    self.output_linear = nn.Linear(self.output_dim, self.output_dim)
    self.is_layer_norm = config.get("is_layer_norm", True)
    if self.is_layer_norm:
        # Attention layer norm
        self.attention_layer_norm = nn.LayerNorm([self.output_dim], eps=1e-6)
        # FFN layer norm
        self.ffn_layer_norm = nn.LayerNorm([self.output_dim], eps=1e-6)
    self.ffn = FFN({
        "name": "FFN",
        "input_dim": self.output_dim,
        "out_dim_0": self.output_dim,
        "out_dim_1": self.output_dim
    })
    self.name = config.get("name", "MultiHeadAttentiveModule")
    logger.info(utils.generate_module_info(
        self.name, "head_num", self.head_num, "input_dim", self.input_dim,
        "output_dim", self.output_dim, "is_layer_norm", self.is_layer_norm))
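# A hedged usage sketch for MultiHeadedAttentiveModule. The asserted keys are
# x_dim, y_dim, and head_num, and x_dim must be divisible by head_num; the
# values below are illustrative only, and the Attention and FFN submodules
# may require additional config keys not visible in this snippet.
config = {
    "name": "MultiHeadAttentiveModule",
    "x_dim": 512,       # input dimension, split across heads
    "y_dim": 512,       # output dimension
    "head_num": 8,      # 512 % 8 == 0
    "is_layer_norm": True,
}
module = MultiHeadedAttentiveModule(config)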
def __init__(self, layer, N):
    super(Encoder, self).__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)
def __init__(self, layer, num_layers):
    super(EncoderBlockGoogle, self).__init__()
    self.layers = clones(layer, num_layers)
    self.norm = LayerNormGoogle(layer.size)
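# Both encoder stacks above follow the same pattern in forward(): run the
# input through each cloned layer, then apply the final layer norm. A sketch,
# assuming the layers take (x, mask) as in the Annotated Transformer:
def forward(self, x, mask):
    for layer in self.layers:
        x = layer(x, mask)
    return self.norm(x)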
def __init__(self, config):
    super(BERTSMNModel, self).__init__()
    # hyperparameters
    self.bert_hidden_size = config.get("bert_hidden_size", 768)
    self.hidden_size = config.get("hidden_size", 200)
    self.rnn_units = config.get("rnn_units", 200)
    self.bert_layers = config.get("bert_layers", [11])
    self.feature_maps = config.get("feature_maps", 8)
    self.dense_out_dim = config.get("dense_out_dim", 50)
    self.drop_prob = config.get("drop_prob", 0.0)
    self.max_num_utterance = config.get("max_num_utterance", 10)
    self.max_sentence_len = config.get("max_sentence_len", 50)
    self.final_out_features = config.get("final_out_features", 2)
    self.device = config["device"]
    assert "bert_model_dir" in config
    self.bert_model_dir = config["bert_model_dir"]
    self.bert_trainable = config["bert_trainable"]

    # build model
    # network
    self.bert_config = BertConfig.from_json_file(os.path.join(self.bert_model_dir, 'bert_config.json'))
    # self.output_layernorm = BertLayerNorm(self.bert_config)
    self.activation = gelu
    self.dropout = nn.Dropout(self.drop_prob)
    ## Sentence GRU: default batch_first is False
    self.sentence_gru = nn.GRU(self.bert_hidden_size, self.hidden_size, batch_first=True)
    ## Linear transformation
    self.a_matrix = nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size, bias=False)
    self.a_matrixs = utils.clones(
        nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size, bias=False),
        len(self.bert_layers))
    ## Convolution layer: valid cross-correlation padding,
    ## 1 + len(bert_layers) in_channels (2 with the default bert_layers=[11]),
    ## feature_maps out_channels, 3x3 kernel,
    ## ReLU activation, and 2d valid max pooling
    in_channels = 1 + len(self.bert_layers)
    self.conv1 = nn.Sequential(OrderedDict([
        ("conv1", nn.Conv2d(in_channels=in_channels, out_channels=self.feature_maps, kernel_size=(3, 3))),
        ("batchnorm", nn.BatchNorm2d(self.feature_maps)),
        ("relu1", nn.ReLU()),
        ("pool1", nn.MaxPool2d(kernel_size=(3, 3), stride=(3, 3)))
    ]))
    ## Dense: fully connected layer
    in_features = op.calculate_dim_with_initialDim_conv(
        (self.max_sentence_len, self.max_sentence_len), self.conv1, in_channels=in_channels)
    self.dense = nn.Sequential(OrderedDict([
        ("linear1", nn.Linear(in_features=in_features, out_features=self.dense_out_dim)),
        ("tanh1", nn.Tanh())
    ]))
    ## Final GRU: time major
    self.final_gru = nn.GRU(self.dense_out_dim, self.rnn_units)
    ## SMN last: linear transformation
    self.smn_last_linear = nn.Linear(self.rnn_units, self.final_out_features)
    self.apply(self.init_weights)
    ## BERT pretrained model
    self.bert = BertModelWrapper.from_pretrained(self.bert_model_dir, cache_dir=None)
    # self.emb_linear = nn.Linear(self.bert_hidden_size, self.hidden_size)
    # self.ctxemb_linear = nn.Linear(self.bert_hidden_size, self.hidden_size)
    # self.dense_linear = nn.Linear(self.max_sentence_len * self.max_sentence_len, self.final_out_features)
    self.hidden1 = None
    self.hidden2 = None
    self.hidden3 = None
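# An illustrative configuration for BERTSMNModel, assembled from the defaults
# read above. Only device, bert_model_dir, and bert_trainable are required;
# everything else falls back to the defaults shown in __init__. The model
# directory path is a placeholder and must contain bert_config.json plus the
# pretrained weights expected by BertModelWrapper.from_pretrained.
import torch

config = {
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "bert_model_dir": "/path/to/bert-base-uncased",  # placeholder path
    "bert_trainable": False,
    "hidden_size": 200,
    "rnn_units": 200,
    "bert_layers": [11],  # which BERT layers feed the extra a_matrixs channels
}
model = BERTSMNModel(config)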