def __init__(self, d_model):
        super().__init__()
        self.d_model = d_model
        self.num_relations = 40
        self.fc_dir_weight = clones(nn.Linear(d_model, d_model, bias=False), 3)
        self.fc_dir_bias = [
            nn.Parameter(torch.zeros(d_model))
            for _ in range(self.num_relations * 2 - 1)
        ]
        self.fc_dir_bias1 = nn.ParameterList(self.fc_dir_bias[-1:])
        self.fc_dir_bias2 = nn.ParameterList(
            self.fc_dir_bias[:self.num_relations - 1])
        self.fc_dir_bias3 = nn.ParameterList(
            self.fc_dir_bias[self.num_relations - 1:-1])

        self.fc_gate_weight = clones(nn.Linear(d_model, d_model, bias=False),
                                     3)
        self.fc_gate_bias = [
            nn.Parameter(torch.zeros(d_model))
            for _ in range(self.num_relations * 2 - 1)
        ]
        self.fc_gate_bias1 = nn.ParameterList(self.fc_gate_bias[-1:])
        self.fc_gate_bias2 = nn.ParameterList(
            self.fc_gate_bias[:self.num_relations - 1])
        self.fc_gate_bias3 = nn.ParameterList(
            self.fc_gate_bias[self.num_relations - 1:-1])
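All of these examples depend on a clones helper. A minimal sketch, assuming the Annotated Transformer convention of deep-copying a module into an nn.ModuleList (not taken verbatim from any single repository above):

# Sketch of the clones helper assumed by the examples on this page.
import copy
import torch.nn as nn

def clones(module, N):
    "Produce N identical, independently parameterized copies of module."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])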
Example #2
 def __init__(self, size, self_attn, feed_forward, dropout):
     super(EncoderLayer, self).__init__()
     # self.self_attn is the Multi-Head Attention Layer
     self.self_attn = self_attn
     self.feed_forward = feed_forward
     self.sublayerconnections = clones(SublayerConnection(size, dropout), 2)
     self.size = size
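The EncoderLayer and DecoderLayer examples clone a SublayerConnection residual wrapper. A minimal sketch of such a wrapper, assuming the pre-norm residual form used in the Annotated Transformer (norm, sublayer, dropout, then the residual add); the actual class in each repository may differ:

# Hypothetical SublayerConnection matching the SublayerConnection(size, dropout)
# calls above; assumes a pre-norm residual block built on nn.LayerNorm.
import torch.nn as nn

class SublayerConnection(nn.Module):
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        # sublayer is a callable, e.g. a lambda wrapping self_attn or feed_forward
        return x + self.dropout(sublayer(self.norm(x)))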
Example #3
 def __init__(self, dm, dropout=0.1):
     super(EncoderBlock, self).__init__()
     self.pe = PositionalEncoding(dm, dropout)
     self.self_attn = Attn()
     self.ffn = PositionWiseFFN(dm, dm // 2)
     self.dropout = dropout
     self.highways = utils.clones(HighWay(dm, dropout), 2)
Example #4
 def __init__(self, size, dropout, self_attn, feed_forward):
     super(EncoderLayer, self).__init__()
     self.size = size
     self.dropout = dropout
     self.self_attn = self_attn
     self.sublayers = clones(SubLayer(size, dropout), 2)
     self.feed_forward = feed_forward
Example #5
 def __init__(self, size, self_attn, feed_forward, dropout):
     super(EncoderLayer, self).__init__()
     self.self_attn = self_attn
     self.feed_forward = feed_forward
     self.sublayer = utils.clones(SublayerConnection(size, dropout), 2)
     self.size = size
     self.local_rnn = LocalRNNLayer(size, dropout)
Example #6
 def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
     super(DecoderLayer, self).__init__()
     self.size = size
     self.self_attn = self_attn
     self.src_attn = src_attn
     self.feed_forward = feed_forward
     self.sublayer = clones(SublayerConnection(size, dropout), 3)
Example #7
 def __init__(self, head, d_embedding, dropout=0.1):
     super(MultiHeadAttention, self).__init__()
     assert d_embedding % head == 0
     self.d_k = d_embedding // head
     self.head = head
     self.linears = clones(nn.Linear(d_embedding, d_embedding), 4)
     self.attn = None
     self.dropout = nn.Dropout(p=dropout)
Example #8
 def __init__(self, N, d_model, h, dropout, bidirectional, mix=False):
     super(Transformer, self).__init__()
     attn = MultiHeadedAttention(h, d_model)
     ff = PositionwiseFeedForward(d_model, dropout=dropout)
     self.bidirectional = bidirectional
     self.model = Encoder(EncoderLayer(d_model, attn, ff, dropout), N, mix)
     if self.bidirectional:
         self.model = clones(self.model, 2)
Example #9
 def __init__(self, num_attention_layers, dim_model, dropout=0.1):
     super(MultiHeadAttention, self).__init__()
     self.h = num_attention_layers
     self.dropout = dropout
     self.d_model = dim_model
     self.d_k = dim_model // self.h
     self.linears = clones(nn.Linear(dim_model, dim_model), 4)
     self.attn = None
     self.drop = nn.Dropout(self.dropout)
Example #10
 def __init__(self, h, d_model, dropout=0.1):
     # "词向量长度和多头数目"
     super(MultiHeadedAttention, self).__init__()
     assert d_model % h == 0
     self.d_k = d_model // h
     self.h = h
     self.linears = clones(nn.Linear(d_model, d_model), 4)
     self.attn = None
     self.dropout = nn.Dropout(p=dropout)
Example #11
    def __init__(self, model_dim, head_count, dropout):
        super(MultiHeadAttn, self).__init__()
        self.model_dim = model_dim
        assert model_dim % head_count == 0
        self.dim_per_head = model_dim // head_count
        self.head_count = head_count

        self.linear_layers = clones(nn.Linear(model_dim, model_dim), 4)
        self.dropout = nn.Dropout(dropout)
Example #12
    def __init__(self, decoder_layer, num_layers):
        """Initializer.

    Args:
      decoder_layer: (DecoderLayer).
      num_layers: (int) number of decoder layers in stack.
    """
        super(Decoder, self).__init__()
        self.decoder_layers = utils.clones(decoder_layer, num_layers)
        self.layer_norm = LayerNorm(decoder_layer.model_size)
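Example #12 only shows the constructor. A plausible forward pass for this decoder stack, assuming each cloned decoder_layer takes (x, memory, src_mask, tgt_mask) and that the final LayerNorm is applied after the loop; the call signature is an assumption, not taken from the repository:

    # Hypothetical forward for the Decoder above.
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.decoder_layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.layer_norm(x)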
Example #13
 def __init__(self, size, dropout, self_attn, src_attn,
              feed_forward, d_model, vocab):
     super(DecoderLayer, self).__init__()
     self.size = size
     self.dropout = nn.Dropout(dropout)
     self.attn = self_attn
     self.src_attn = src_attn
     self.sub_layers = clones(SubLayer(size, dropout), 3)
     self.feed_forward  = feed_forward
     self.generator = Generator(d_model=d_model, vocab=vocab)
Example #14
 def __init__(self, h, d_model, dropout=0.1):
     "Take in model size and number of heads."
     super(MultiHeadedAttention, self).__init__()
     assert d_model % h == 0
     # We assume d_v always equals d_k
     self.d_k = d_model // h
     self.h = h
     self.linears = clones(linear.Linear(d_model, d_model), 4)
     self.attn = None
     self.dropout = nn.Dropout(p=dropout)
Example #15
 def __init__(self, h: int, d_model: int, dropout=0.1):
     """Take in model size and number of heads"""
     super(MultiHeadAttention, self).__init__()
     assert d_model % h == 0
     self.d_k = d_model // h  # ex: h=8, d_model=512, d_k=64
     self.h = h
     # 4 linear modules from d_model to d_model
     self.linears = clones(nn.Linear(d_model, d_model), 4)
     self.attn = None  # attention weights computed
     self.dropout = nn.Dropout(p=dropout)
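The self.attn slot in these attention modules typically caches the weights returned by a scaled dot-product attention function. A minimal sketch of such a function; the name attention and the mask convention (0 = masked) are assumptions:

# Hypothetical scaled dot-product attention used by the multi-head modules here.
import math
import torch

def attention(query, key, value, mask=None, dropout=None):
    # query, key, value: (batch, h, seq_len, d_k)
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn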
Example #16
 def __init__(self,
              size: int,
              self_attn: MultiHeadAttention,
              feed_forward: PositionwiseFeedForward,
              dropout=0.1):
     super(EncoderLayer, self).__init__()
     self.size = size
     self.self_attn = self_attn
     self.feed_forward = feed_forward
     # 2 sub-layers: 1 self-attention + 1 feed-forward
     self.sublayer = clones(SublayerConnection(size, dropout), 2)
Example #17
    def __init__(self, d_model, k, num_heads, num_features, dropout=0):
        super(TreeRelativePosition, self).__init__()

        self.d_model = d_model
        self.k = k
        self.num_features = num_features
        self.num_heads = num_heads

        self.dropout = nn.Dropout(dropout)
        self.emb_list = clones(nn.Embedding(2 * k + 2, d_model * 2),
                               num_features)
Example #18
 def __init__(self, h, d_model, dropout=0.1):
     "Take in model size and number of heads."
     super(MultiHeadedAttention, self).__init__()
     assert d_model % h == 0
     # We assume d_v always equals d_k
     self.d_k = d_model // h  # The output dim of each head
     self.h = h  # Number of heads
     # The first 3 linear modules are the combined {W_i}^Q, {W_i}^K, {W_i}^V; the last is W^O
     self.linears = clones(nn.Linear(d_model, d_model), 4)
     self.attn = None
     self.dropout = nn.Dropout(p=dropout)
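A sketch of how the four cloned linears are usually consumed in forward: the first three project query, key and value and are split into h heads, and the fourth (self.linears[-1]) projects the concatenated heads back to d_model. This follows the Annotated Transformer pattern and relies on an attention() helper like the sketch after Example #15; it is not copied from the repository above:

 def forward(self, query, key, value, mask=None):
     if mask is not None:
         mask = mask.unsqueeze(1)  # broadcast the same mask over every head
     nbatches = query.size(0)
     # 1) Project, then reshape to (batch, h, seq_len, d_k)
     query, key, value = [
         lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
         for lin, x in zip(self.linears, (query, key, value))
     ]
     # 2) Apply attention over all heads in parallel
     x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
     # 3) Concatenate the heads and apply the final projection self.linears[-1]
     x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
     return self.linears[-1](x)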
Example #19
 def __init__(self, d_model=512, h=8, d_ff=2048, dropout_rate=0.1):
     super(DecoderLayer, self).__init__()
     self.self_attn = MultiHeadedAttention(h,
                                           d_model,
                                           dropout_rate=dropout_rate)
     self.src_attn = MultiHeadedAttention(h,
                                          d_model,
                                          dropout_rate=dropout_rate)
     self.feed_forward = PositionwiseFeedForward(d_model,
                                                 d_ff,
                                                 dropout_rate=dropout_rate)
     self.sublayers = clones(SublayerConnection(d_model, dropout_rate), 3)
Example #20
 def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
     super(DecoderLayer, self).__init__()
     # size = d_embedding = 512,
     # self_attn: a MultiHeadAttention object over the target sequence (tgt_vocab)
     # src_attn: a second MultiHeadAttention object, between tgt_vocab and src_vocab
     # feed_forward: the final fully-connected (position-wise feed-forward) sublayer
     # dropout = 0.1
     self.size = size
     self.self_attn = self_attn
     self.src_attn = src_attn
     self.feed_forward = feed_forward
     # Three SublayerConnection objects:
     # self.self_attn, self.src_attn, and self.feed_forward
     self.sublayer = clones(SublayerConnection(size, dropout), 3)
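The three SublayerConnection objects in Example #20 are typically wired up as below: masked self-attention over the target, source attention over the encoder memory, then the feed-forward network. A sketch following the Annotated Transformer; the memory and mask argument names are assumptions:

 def forward(self, x, memory, src_mask, tgt_mask):
     m = memory  # encoder output, (batch, num_src_word, d_embedding)
     x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
     x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
     return self.sublayer[2](x, self.feed_forward)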
Example #21
 def __init__(self, d_model, h, dropout=0.1):
     """
     multi-head attention
     :param h: nhead
     :param d_model: d_model
     :param dropout: float
     """
     super(MultiHeadedAttention, self).__init__()
     assert d_model % h == 0
     #  assume d_v always equals d_k
     self.d_k = d_model // h
     self.h = h
     self.linears = utils.clones(nn.Linear(d_model, d_model), 4)
     self.attn = None
     self.dropout = nn.Dropout(p=dropout)
Example #22
    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        # print(d_model, h)
        assert d_model % h == 0

        # self.d_k is the reduced dimension of each parallel attention
        self.d_k = d_model // h
        self.h = h

        # self.linears is a list consisting of 4 projection layers:
        # self.linears[0]: Concat(W^Q_i), where i \in [1,...,h].
        # self.linears[1]: Concat(W^K_i), where i \in [1,...,h].
        # self.linears[2]: Concat(W^V_i), where i \in [1,...,h].
        # self.linears[3]: W^O, the output projection.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
Example #23
    def __init__(self, config):
        super(TOI_BERT, self).__init__()
        self.config = config
        self.outfile = None

        self.input_size_bert = config.input_size_bert
        self.input_size = self.input_size_bert if config.use_bert else 0

        if self.config.if_DTE:
            self.dte = DTE(config)
            self.input_size += self.dte.input_size

        if self.config.use_cnn:
            self.res_nets = clones(TOI_CNN_RES(self.input_size, self.input_size, kernal_size=self.config.kernel_size), self.config.cnn_block)
        else:
            self.project = nn.Sequential(
                nn.Linear(self.input_size, self.input_size),
                nn.Dropout(0.5),
                nn.ReLU()
            )

        self.hat_1 = TOI_Pooling(self.input_size, self.config.if_gpu, self.config.hit_pooling_size)

        self.pooling_size = 2 + self.config.hit_pooling_size

        self.one_step_to_share = nn.Sequential(
            nn.Linear(self.input_size * self.pooling_size, self.config.nested_depth_fc_size),
            nn.Dropout(0.5),
            nn.ReLU(),
        )

        self.one_step_to_heaven = nn.Sequential(
            nn.Linear(self.config.nested_depth_fc_size, self.config.label_kinds),
        )

        self.one_step_to_hell = nn.Sequential(
            nn.Linear(self.config.nested_depth_fc_size, self.config.nested_depth),
        )

        if self.config.fusion:
            self.fusion = nn.Sequential(
                nn.Softmax(dim = 0)
            )
            self.fusion_parameters = torch.nn.Parameter(torch.ones(config.fusion_layer, 1))
            self.fusion_gamma = torch.nn.Parameter(torch.ones(1))

        self.cls_ce_loss = nn.CrossEntropyLoss()
Example #24
    def __init__(self, h, d_k, d_model, p_drop):
        '''
        In the paper: h = 8, d_k = d_v = 64, d_model = 512, p_drop = 0.1. Assume d_k = d_v
        Check whether d_model is a multiple of h
        '''
        super(MultiHeadAttention, self).__init__()
        assert d_model % h == 0
        self.h = h
        self.d_k = d_k
        self.d_model = d_model
        self.p_drop = p_drop

        self.linear_layers = clones(nn.Linear(in_features = d_model, out_features = d_model, bias = False),4)

        self.attention = None
        self.layernorm = nn.LayerNorm(normalized_shape = d_model)
        self.drop = nn.Dropout(p = p_drop)
Example #25
    def __init__(self, model_size, multi_headed_attention, feed_forward,
                 dropout_rate):
        """Initializer.

    Args:
      model_size: (int) model input feature size.
      multi_headed_attention: (MultiHeadedAttention).
      feed_forward: (PositionwiseFeedForward).
      dropout_rate: (float) dropout rate.
    """
        super(EncoderLayer, self).__init__()
        self.self_attention = multi_headed_attention
        self.feed_forward = feed_forward
        # Two sublayers: one applied after self-attention, the other
        #   applied after position-wise feedforward.
        self.sublayer = utils.clones(Sublayer(model_size, dropout_rate), 2)
        self.model_size = model_size
Example #26
    def __init__(self, num_heads, model_size, dropout_rate=0.1):
        """Initializer.

    Args:
      num_heads: (int) number of attention heads.
      model_size: (int) model input feature size.
      dropout_rate: (float) dropout rate.
    """
        super(MultiHeadedAttention, self).__init__()
        assert model_size % num_heads == 0
        self.head_size = model_size // num_heads
        self.num_heads = num_heads
        # Linear projections for query, key, value, and the output
        #   of multi-headed attention layer. 4 in total.
        self.linears = utils.clones(nn.Linear(model_size, model_size), 4)
        self.attention = None
        self.dropout = nn.Dropout(p=dropout_rate)
Example #27
    def __init__(self, d_model, h, max_relative_position, dropout=.0):
        """
        multi-head attention
        :param h: nhead
        :param d_model: d_model
        :param dropout: float
        """
        super(MultiHeadedAttention_RPR, self).__init__()
        assert d_model % h == 0
        #  assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = utils.clones(nn.Linear(d_model, d_model), 4)
        self.dropout = nn.Dropout(p=dropout)

        self.max_relative_position = max_relative_position
        self.vocab_size = max_relative_position * 2 + 1
        self.embed_K = nn.Embedding(self.vocab_size, self.d_k)
        self.embed_V = nn.Embedding(self.vocab_size, self.d_k)
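The embed_K and embed_V tables in Example #27 are indexed with relative distances clipped to [-max_relative_position, max_relative_position] and shifted into [0, 2 * max_relative_position], which is why vocab_size = max_relative_position * 2 + 1. A small sketch of building such an index matrix (Shaw et al.-style relative positions; the helper name is hypothetical):

# Hypothetical helper for clipped relative-position indices.
import torch

def relative_position_index(seq_len, max_relative_position):
    positions = torch.arange(seq_len)
    distance = positions[None, :] - positions[:, None]           # (seq_len, seq_len)
    distance = distance.clamp(-max_relative_position, max_relative_position)
    return distance + max_relative_position                       # ids in [0, 2*max]

# e.g. self.embed_K(relative_position_index(L, self.max_relative_position))
# gives an (L, L, d_k) tensor of relative key embeddings.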
Example #28
    def __init__(self, h, d_model, dropout=0.1):
        """
        :param h: number of heads (parallel attention layers)
        :param d_model: dimension of input(embedding)

        :parameter linears:
                        Wq_i [d_model, d_k] * h
                        Wk_i [d_model, d_k] * h
                        Wv_i [d_model, d_v] * h
                        Wo [d_v * h, d_model]

        """
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
Example #29
    def __init__(self, model_size, multi_headed_attention_1,
                 multi_headed_attention_2, feed_forward, dropout_rate):
        """Initializer.

    Args:
      model_size: (int) model input feature size.
      multi_headed_attention_1: (MultiHeadedAttention).
      multi_headed_attention_2: (MultiHeadedAttention).
      feed_forward: (PositionwiseFeedForward).
      dropout_rate: (float) dropout rate.
    """
        super(DecoderLayer, self).__init__()
        self.self_attention = multi_headed_attention_1
        self.source_attention = multi_headed_attention_2
        self.feed_forward = feed_forward
        # Three sublayers: one applied after self-attention, one applied
        #   after source attention, and the last applied after the
        #   position-wise feedforward.
        self.sublayer = utils.clones(Sublayer(model_size, dropout_rate), 3)
        self.model_size = model_size
Example #30
    def __init__(self, size, self_attn, feed_forward, dropout):
        # size=d_embedding=512
        # self_attn = an object of MultiHeadAttention, first sublayer
        # feed_forward = an object of PositionwiseFeedForward, second sublayer
        # dropout = 0.1 (e.g.)
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        # x: (batch_size, num_word, d_embedding)
        # mask: (batch_size, num_word, num_word), the padding mask in the Encoder
        # in src_vocab, all words except the "<blank>" padding tokens are visible;
        # in tgt_vocab, only the words to the left of the current word are visible
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        # self_attn (MultiHeadAttention) keeps the shape (batch, num_word, d_embedding),
        # and so does SublayerConnection
        return self.sublayer[1](x, self.feed_forward)