Example #1
    def __init__(
        self,
        h,
        d_model,
        p,
        d_ff,
        attn_p=0.1,
    ):
        super(LMDecoderLayer, self).__init__()

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)

        self.multihead_tgt = MultiHeadAttention(h,
                                                d_model,
                                                attn_p=attn_p,
                                                static=onmt.Constants.static,
                                                share=1)

        ff_p = p
        feedforward = FeedForward(d_model,
                                  d_ff,
                                  ff_p,
                                  static=onmt.Constants.static)
        self.feedforward = Bottle(feedforward)
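
Note: nearly every example on this page wraps its sub-layers in PrePostProcessing(d_model, p, sequence=...), whose definition is not shown here. Judging from the usage ('n' before a sub-layer, 'da' after it, plain 'd' on embeddings), the sequence string appears to name processing steps applied in order. The sketch below is an assumption for illustration only; the class name and step mapping are guesses, not the actual onmt implementation.

import torch.nn as nn

class SimplePrePostProcessing(nn.Module):
    """Hypothetical stand-in: apply the steps named in `sequence` in order
    ('n' = layer norm, 'd' = dropout, 'a' = add the residual)."""

    def __init__(self, d_model, dropout_p, sequence='nda'):
        super().__init__()
        self.sequence = sequence
        self.layer_norm = nn.LayerNorm(d_model) if 'n' in sequence else None
        self.dropout = nn.Dropout(dropout_p) if 'd' in sequence else None

    def forward(self, x, residual=None):
        for step in self.sequence:
            if step == 'n':
                x = self.layer_norm(x)
            elif step == 'd':
                x = self.dropout(x)
            elif step == 'a' and residual is not None:
                x = x + residual
        return x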
Example #2
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0):
        super(ParallelEncoderLayer, self).__init__()
        self.version = version

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)
        self.multihead = MultiHeadAttention(h,
                                            d_model,
                                            attn_p=attn_p,
                                            static=onmt.Constants.static)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        self.feedforward = Bottle(feedforward)
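
Note: several examples switch to a MaxOut feed-forward when onmt.Constants.activation_layer == 'maxout', with k = int(math.ceil(d_ff / d_model)) so the projection is roughly d_ff units wide. MaxOut itself is not shown; below is a generic sketch of the standard maxout operation (the class name is a placeholder, not the library's module).

import torch.nn as nn

class SimpleMaxOut(nn.Module):
    """Hypothetical maxout layer: project to d_out * pool_size units and keep
    the maximum over each group of pool_size units."""

    def __init__(self, d_in, d_out, pool_size):
        super().__init__()
        self.d_out = d_out
        self.pool_size = pool_size
        self.linear = nn.Linear(d_in, d_out * pool_size)

    def forward(self, x):
        projected = self.linear(x)
        projected = projected.view(*x.shape[:-1], self.d_out, self.pool_size)
        return projected.max(dim=-1)[0]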
Example #3
 def __init__(self, opt, dicts, positional_encoder):
 
     super(TransformerEncoder, self).__init__()
     
     self.model_size = opt.model_size
     self.n_heads = opt.n_heads
     self.inner_size = opt.inner_size
     self.layers = opt.layers
     self.dropout = opt.dropout
     self.word_dropout = opt.word_dropout
     self.attn_dropout = opt.attn_dropout
     self.emb_dropout = opt.emb_dropout
     self.time = opt.time
     self.version = opt.version
     
     self.word_lut = nn.Embedding(dicts.size(),
                                  self.model_size,
                                  padding_idx=onmt.Constants.PAD)
     
     if opt.time == 'positional_encoding':
         self.time_transformer = positional_encoder
     elif opt.time == 'gru':
         self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
     elif opt.time == 'lstm':
         self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)
     
     self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
     
     self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
     
     self.positional_encoder = positional_encoder
     
     self.layer_modules = nn.ModuleList([EncoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) for _ in range(self.layers)])
 
     self.pretrained_point = -1
Example #4
    def __init__(self,
                 opt,
                 embedding,
                 positional_encoder,
                 attribute_embeddings=None,
                 ignore_source=False):

        super(TransformerDecoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.encoder_type = opt.encoder_type
        self.ignore_source = ignore_source
        self.encoder_cnn_downsampling = opt.cnn_downsampling

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        else:
            raise NotImplementedError

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.word_lut = embedding

        # Using feature embeddings in models
        if attribute_embeddings is not None:
            self.use_feature = True
            self.attribute_embeddings = attribute_embeddings
            self.feature_projector = nn.Linear(
                opt.model_size + opt.model_size * attribute_embeddings.size(),
                opt.model_size)
        else:
            self.use_feature = None

        self.positional_encoder = positional_encoder

        len_max = self.positional_encoder.len_max
        mask = torch.ByteTensor(
            np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)

        self.build_modules()
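
Note: the decoder examples all register the same upper-triangular 'mask' buffer, which blocks attention to future positions. A standalone look at its contents, using an arbitrary len_max = 5:

import numpy as np
import torch

len_max = 5
mask = torch.ByteTensor(np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
# Row i has ones in the columns position i may NOT attend to (the future):
# tensor([[0, 1, 1, 1, 1],
#         [0, 0, 1, 1, 1],
#         [0, 0, 0, 1, 1],
#         [0, 0, 0, 0, 1],
#         [0, 0, 0, 0, 0]], dtype=torch.uint8)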
Example #5
 def __init__(self, opt, dicts, positional_encoder):
 
     super(StochasticTransformerDecoder, self).__init__()
     
     self.model_size = opt.model_size
     self.n_heads = opt.n_heads
     self.inner_size = opt.inner_size
     self.layers = opt.layers
     self.dropout = opt.dropout
     self.word_dropout = opt.word_dropout 
     self.attn_dropout = opt.attn_dropout
     self.emb_dropout = opt.emb_dropout
     self.time = opt.time
     self.death_rate = opt.death_rate
     
     if hasattr(opt, 'grow_dropout'):
         self.grow_dropout = opt.grow_dropout
     
     if opt.time == 'positional_encoding':
         self.time_transformer = positional_encoder
     elif opt.time == 'gru':
         self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
     elif opt.time == 'lstm':
         self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)
     
     #self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
     self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=onmt.Constants.static)
     self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
     
     self.word_lut = nn.Embedding(dicts.size(),
                                  self.model_size,
                                  padding_idx=onmt.Constants.PAD)
     
     self.positional_encoder = positional_encoder

     self.layer_modules = nn.ModuleList()       
     for l in range(self.layers):
         
         # linearly decay the death rate
         death_r = ( l + 1 ) / self.layers * self.death_rate
         
         block = StochasticDecoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout, death_rate=death_r)
         
         self.layer_modules.append(block)
         
     e_length = expected_length(self.layers, self.death_rate)    
     
     print("Stochastic Decoder with %.2f expected layers" % e_length) 
     # self.layer_modules = nn.ModuleList([DecoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) for _ in range(e_length)])
     
     len_max = self.positional_encoder.len_max
     # print(len_max)
     mask = torch.ByteTensor(np.triu(np.ones((len_max,len_max)), k=1).astype('uint8'))
     self.register_buffer('mask', mask)
Example #6
    def __init__(self, opt, dicts, positional_encoder):

        super(FCTransformerDecoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size,
                                           self.model_size,
                                           1,
                                           batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size,
                                            self.model_size,
                                            1,
                                            batch_first=True)

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)
        if self.version == 1.0:
            self.postprocess_layer = PrePostProcessing(self.model_size,
                                                       0,
                                                       sequence='n')

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        self.positional_encoder = positional_encoder

        self.layer_modules = nn.ModuleList([
            FCTDecoderLayer(self.n_heads, self.model_size, self.dropout,
                            self.inner_size, self.attn_dropout)
            for _ in range(self.layers)
        ])

        len_max = self.positional_encoder.len_max
        mask = torch.ByteTensor(
            np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)
Example #7
    def __init__(self, opt, embedding, positional_encoder, encoder_type='text'):

        super(TransformerEncoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        if hasattr(opt, 'encoder_layers') and opt.encoder_layers != -1:
            self.layers = opt.encoder_layers
        else:
            self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.input_type = encoder_type
        self.cnn_downsampling = opt.cnn_downsampling
        self.channels = 1
        feature_size = opt.input_size

        if encoder_type != "text":
            if not self.cnn_downsampling:
                self.audio_trans = nn.Linear(feature_size, self.model_size)
                torch.nn.init.xavier_uniform_(self.audio_trans.weight)
            else:
                channels = self.channels
                cnn = [nn.Conv2d(channels, 64, kernel_size=(3, 3), stride=2), nn.ReLU(True), nn.BatchNorm2d(64),
                       nn.Conv2d(64, 64, kernel_size=(3, 3), stride=2), nn.ReLU(True), nn.BatchNorm2d(64)]
                self.audio_trans = nn.Sequential(*cnn)

                # self.model_size =
                feat_size = (((feature_size // channels) - 3) // 4) * 64
                assert self.model_size == feat_size, \
                    "The model dimension doesn't match with the feature dim, expecting %d " % feat_size
        else:
            self.word_lut = embedding

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

        self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)

        self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

        self.positional_encoder = positional_encoder

        self.build_modules()
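
Note: the assert above (Example #7) ties opt.model_size to the width left after the two stride-2 convolutions. A quick check of that arithmetic with a hypothetical 40-dimensional audio feature (40 is an assumed value, purely for illustration):

feature_size = 40   # assumed filterbank size per frame
channels = 1
# two Conv2d(kernel_size=3, stride=2) layers shrink the feature axis roughly by 4,
# and the 64 output channels are folded back into the feature dimension
feat_size = (((feature_size // channels) - 3) // 4) * 64
print(feat_size)    # 576 -> opt.model_size would have to equal 576 here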
Example #8
    def __init__(self, opt, dicts, positional_encoder):

        super(TransformerEncoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        if hasattr(opt, 'encoder_layers') and opt.encoder_layers != -1:
            self.layers = opt.encoder_layers
        else:
            self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.input_type = opt.encoder_type

        if opt.encoder_type != "text":
            self.audio_trans = nn.Linear(dicts, self.model_size)
        else:
            self.word_lut = nn.Embedding(dicts.size(),
                                         self.model_size,
                                         padding_idx=onmt.Constants.PAD)

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size,
                                           self.model_size,
                                           1,
                                           batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size,
                                            self.model_size,
                                            1,
                                            batch_first=True)

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.positional_encoder = positional_encoder

        self.build_modules()
Example #9
 def __init__(self, opt, dicts, positional_encoder):
 
     super(StochasticTransformerEncoder, self).__init__()
     
     self.model_size = opt.model_size
     self.n_heads = opt.n_heads
     self.inner_size = opt.inner_size
     self.layers = opt.layers
     self.dropout = opt.dropout
     self.word_dropout = opt.word_dropout
     self.attn_dropout = opt.attn_dropout
     self.emb_dropout = opt.emb_dropout
     self.time = opt.time
     self.death_rate = opt.death_rate
     
     if hasattr(opt, 'grow_dropout'):
         self.grow_dropout = opt.grow_dropout
     
     self.word_lut = nn.Embedding(dicts.size(),
                                  self.model_size,
                                  padding_idx=onmt.Constants.PAD)
     
     if opt.time == 'positional_encoding':
         self.time_transformer = positional_encoder
     elif opt.time == 'gru':
         self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
     elif opt.time == 'lstm':
         self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)
     
     #self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
     self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=onmt.Constants.static)
     
     self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n', elementwise_affine=True)
     
     self.positional_encoder = positional_encoder
     
     self.layer_modules = nn.ModuleList()
     
     for l in range(self.layers):
         
         # linearly decay the death rate
         
         death_r = ( l + 1.0 ) / self.layers * self.death_rate
         
         block = StochasticEncoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout, death_rate=death_r)
         
         self.layer_modules.append(block)
     
     e_length = expected_length(self.layers, self.death_rate)    
     
     print("Stochastic Encoder with %.2f expected layers" % e_length) 
Example #10
 def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
     
     super(EncoderLayer, self).__init__()
     self.preprocess_rnn = PrePostProcessing(d_model, p, sequence='n')
     
     self.postprocess_rnn = PrePostProcessing(d_model, p, sequence='da')
     
     self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
     self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da')
     
     
     self.rnn = nn.LSTM(d_model, d_model//2, 1, bidirectional=True)
     
     #~ feedforward = FeedForward(d_model, d_ff, p)
     self.ffn = FeedForward(d_model, d_ff, p)
Example #11
    def __init__(self, opt, dicts, positional_encoder, ignore_source=False):

        super(TransformerDecoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.encoder_type = opt.encoder_type
        self.ignore_source = ignore_source

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        else:
            raise NotImplementedError
        # elif opt.time == 'gru':
        #     self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
        # elif opt.time == 'lstm':
        #     self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        self.positional_encoder = positional_encoder

        len_max = self.positional_encoder.len_max
        mask = torch.ByteTensor(
            np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)

        self.build_modules()
Example #12
    def __init__(self, opt, dicts, positional_encoder):

        super(TransformerEncoder, self).__init__()

        self.model_size = opt.model_size  #dmodel which is the dimension between sublayers
        self.n_heads = opt.n_heads  #heads in multihead attention
        self.inner_size = opt.inner_size  #Size of feed forward network in sublayer
        self.layers = opt.layers  #Amount of stacked encoder/decoder layers in the model
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout  #D.S: Dropout which is applied by converting input to embedding
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.residual_dropout = opt.residual_dropout

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size,
                                           self.model_size,
                                           1,
                                           batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size,
                                            self.model_size,
                                            1,
                                            batch_first=True)

        #Performs Preprocessing (here its dropout)
        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        #Performs Postprocessing (here its layerNorm)
        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.positional_encoder = positional_encoder

        self.build_modules()
Example #13
    def __init__(self, opt, embeddings, positional_encoder, attribute_embeddings=None, generator=None):
        """
        :param opt: Options
        :param embeddings: a list of two embedding tables [src tgt]
        :param positional_encoder: The sinusoidal positional encoding
        :param attribute_embeddings: To be implemented
        """
        super(RelativeTransformer, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.encoder_type = opt.encoder_type
        self.encoder_cnn_downsampling = opt.cnn_downsampling
        self.variational_dropout = opt.variational_dropout
        self.switchout = opt.switchout
        self.death_rate = opt.death_rate
        self.layer_modules = None
        self.use_feature = False

        self.d_head = self.model_size // self.n_heads

        if self.switchout > 0:
            self.word_dropout = 0

        self.positional_encoder = positional_encoder
        self.relative = True
        # two embedding layers for src and tgt
        self.src_word_lut = embeddings[0]
        self.tgt_word_lut = embeddings[1]
        self.generator = generator

        self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                                  variational=self.variational_dropout)

        self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

        self.build_modules()
Example #14
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
        super(RelativeTransformerEncoderLayer, self).__init__()

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.preprocess_attn_rev = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)

        self.d_head = d_head = d_model // h
        self.multihead_fwd = RelPartialLearnableMultiHeadAttn(h // 2,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)
        self.multihead_bwd = RelPartialLearnableMultiHeadAttn(h // 2,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)
        self.attn_out = Linear(h * self.d_head, d_model)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      static=onmt.Constants.static)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.Constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           static=onmt.Constants.static)
        self.feedforward = feedforward
Example #15
 def __init__(self, opt, dicts):
     self.model_size = opt.model_size
     self.n_heads = opt.n_heads
     self.inner_size = opt.inner_size
     self.layers = opt.layers
     self.dropout = opt.dropout
     self.word_dropout = opt.word_dropout 
     self.attn_dropout = opt.attn_dropout
     self.emb_dropout = opt.emb_dropout
     
     super(RecurrentDecoder, self).__init__()
     
     self.word_lut = nn.Embedding(dicts.size(),
                                  self.model_size,
                                  padding_idx=onmt.Constants.PAD)
     
     self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d')
     
     self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
     
     self.layer_modules = nn.ModuleList([DecoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) for _ in range(self.layers)])
Example #16
    def __init__(self, opt, dicts, positional_encoder, time_encoder):

        super(UniversalTransformerDecoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time

        self.positional_encoder = positional_encoder

        self.time_encoder = time_encoder

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=onmt.Constants.static)
        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        self.positional_encoder = positional_encoder

        self.recurrent_layer = UniversalDecoderLayer(
            self.n_heads, self.model_size, self.dropout, self.inner_size,
            self.positional_encoder, self.time_encoder, self.attn_dropout)

        len_max = self.positional_encoder.len_max
        mask = torch.ByteTensor(
            np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)
Example #17
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1, variational=False):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.variational = variational
        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  variational=self.variational)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 variational=self.variational)

        d_head = d_model // h
        self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      variational=self.variational)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.Constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           variational=self.variational)
        else:
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
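
Note: Example #17 passes variational=... to its dropout-carrying modules. Variational (locked) dropout usually means reusing one mask across all time steps of a sequence; the sketch below shows that general technique only, with an assumed (time, batch, features) layout, and is not the library's implementation.

import torch.nn as nn

class VariationalDropout(nn.Module):
    """Drop the same feature units at every time step of a sequence."""

    def __init__(self, p):
        super().__init__()
        self.p = p

    def forward(self, x):  # x: (time, batch, features)
        if not self.training or self.p == 0.0:
            return x
        # one Bernoulli keep-mask per (batch, feature), broadcast over time
        mask = x.new_empty(1, x.size(1), x.size(2)).bernoulli_(1.0 - self.p)
        return x * mask / (1.0 - self.p)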
Example #18
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 attn_p=0.1,
                 version=1.0,
                 ignore_source=False):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.ignore_source = ignore_source

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(d_model,
                                                         p,
                                                         sequence='n')
            self.postprocess_src_attn = PrePostProcessing(
                d_model, p, sequence='da', static=onmt.Constants.static)
            self.multihead_src = MultiHeadAttention(
                h,
                d_model,
                attn_p=attn_p,
                static=onmt.Constants.static,
                share=2)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)

        # self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=1)
        d_head = d_model // h
        self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      static=onmt.Constants.static)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.Constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           static=onmt.Constants.static)
        self.feedforward = feedforward
Example #19
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 pos_encoder,
                 time_encoder,
                 attn_p=0.1,
                 version=1.0):
        super(UniversalEncoderLayer, self).__init__()
        self.version = version
        # position and time embedding is added into the input before the layer
        self.pos_encoder = pos_encoder
        self.time_encoder = time_encoder

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)
        self.multihead = MultiHeadAttention(h,
                                            d_model,
                                            attn_p=attn_p,
                                            static=onmt.Constants.static)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        self.feedforward = Bottle(feedforward)
Example #20
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
        super(FCTEncoderLayer, self).__init__()

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=True)
        #~ self.multihead = HierarchicalMultiHeadAttention(h, d_model, attn_p=attn_p)
        self.multihead = UniformMultiHeadAttention(h, d_model, attn_p=attn_p)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=True)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        self.feedforward = Bottle(feedforward)
Example #21
    def __init__(self, opt, dicts):

        super().__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.encoder_type = opt.encoder_type

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        self.rnn = nn.LSTM(self.model_size,
                           self.model_size,
                           num_layers=3,
                           dropout=self.dropout)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   self.emb_dropout,
                                                   sequence='d',
                                                   static=False)

        self.h = None
        self.c = None
Example #22
    def __init__(self, opt, dicts, positional_encoder, time_encoder):

        super(UniversalTransformerEncoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        self.positional_encoder = positional_encoder

        self.time_encoder = time_encoder

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=onmt.Constants.static)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.positional_encoder = positional_encoder

        self.recurrent_layer = UniversalEncoderLayer(
            self.n_heads, self.model_size, self.dropout, self.inner_size,
            self.positional_encoder, self.time_encoder, self.attn_dropout)
Example #23
 def add_layers(self, n_new_layer):
     
     self.new_modules = list()
     self.layers += n_new_layer
     
     for i in range(n_new_layer):
         layer = EncoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) 
         
         # the first layer will use the preprocessing which is the last postprocessing
         if i == 0:
             layer.preprocess_attn = self.postprocess_layer
             # replace the last postprocessing layer with a new one
             self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
         
         self.layer_modules.append(layer)
Example #24
 def add_layers(self, n_new_layer):
     
     self.new_modules = list()
     self.layers += n_new_layer
     
     for i in range(n_new_layer):
         layer = ParallelEncoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) 
         
         # the first layer will use the preprocessing which is the last postprocessing
         if i == 0:
             layer.preprocess_attn.load_state_dict(self.postprocess_layer.state_dict())
             #~ layer.preprocess_attn.layer_norm.function.weight.requires_grad = False
             #~ layer.preprocess_attn.layer_norm.function.bias.requires_grad = False
             #~ if hasattr(layer.postprocess_attn, 'k'):
                 #~ layer.postprocess_attn.k.data.fill_(0.01)
             
             # replace the last postprocessing layer with a new one
             self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
         
         self.layer_modules.append(layer)
Example #25
 def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
     
     super(DecoderLayer, self).__init__()
     self.preprocess_rnn = PrePostProcessing(d_model, p, sequence='n')
     
     self.postprocess_rnn = PrePostProcessing(d_model, p, sequence='da')
     
     self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
     self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da')
     
     self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
     self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da')
     
     
     self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p)
     self.rnn = nn.LSTM(d_model, d_model, 1, bidirectional=False)
     feedforward = FeedForward(d_model, d_ff, p)
     self.feedforward = feedforward  
Example #26
    def __init__(self,
                 opt,
                 dicts,
                 positional_encoder,
                 ignore_source=False,
                 feature_embedding=None):

        super(TransformerDecoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.encoder_type = opt.encoder_type
        self.ignore_source = ignore_source

        self.fixed_target_length = 0

        if hasattr(opt, 'fixed_target_length'):
            if opt.fixed_target_length == "int":
                self.fixed_target_length = 1
                print('Embedding')
            elif opt.fixed_target_length == "encoding":
                self.fixed_target_length = 2
                print('Encoding')
            elif opt.fixed_target_length == "forward_backward_encoding":
                self.fixed_target_length = 3
                print('Forward backward encoding')
            elif opt.fixed_target_length == "no":
                print('No fixed target len.')
            else:
                raise NotImplementedError

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        else:
            raise NotImplementedError
        # elif opt.time == 'gru':
        #     self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
        # elif opt.time == 'lstm':
        #     self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        # self.feat_lut = feature_embedding

        # if self.feat_lut is not None:
        #     self.enable_feature = True
        #     self.feature_projector = nn.Linear(opt.model_size * 2, opt.model_size)
        # else:
        self.enable_feature = False

        self.positional_encoder = positional_encoder

        if self.fixed_target_length == 1:
            self.length_lut = nn.Embedding(8192,
                                           opt.model_size,
                                           padding_idx=onmt.Constants.PAD)
            self.length_projector = nn.Linear(opt.model_size * 2,
                                              opt.model_size)

        len_max = self.positional_encoder.len_max
        mask = torch.ByteTensor(
            np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)

        self.build_modules()
Example #27
    def __init__(self,
                 opt,
                 vec_linear,
                 positional_encoder,
                 encoder_type='text'):

        super(TransformerEncoder, self).__init__()

        # # by me
        # assert bert_embeddings is not None

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        if hasattr(opt, 'encoder_layers') and opt.encoder_layers != -1:
            self.layers = opt.encoder_layers
        else:
            self.layers = opt.layers
        self.dropout = opt.hidden_dropout
        # the word dropout for src is handled inside BERT
        # self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.enc_emb_dropout = opt.enc_emb_dropout
        self.enc_gradient_checkpointing = opt.enc_gradient_checkpointing

        self.time = opt.time
        self.version = opt.version
        self.input_type = encoder_type
        self.cnn_downsampling = opt.cnn_downsampling

        self.switchout = opt.switchout
        self.variational_dropout = opt.variational_dropout
        self.fp16 = opt.fp16

        # disable word dropout when switch out is in action
        # if self.switchout > 0.0:
        #     self.word_dropout = 0.0

        feature_size = opt.input_size
        self.channels = 1  # n. audio channels

        if opt.upsampling:
            feature_size = feature_size // 4

        if encoder_type != "text":
            if not self.cnn_downsampling:
                self.audio_trans = nn.Linear(feature_size, self.model_size)
                torch.nn.init.xavier_uniform_(self.audio_trans.weight)
            else:
                channels = self.channels
                cnn = [
                    nn.Conv2d(channels, 32, kernel_size=(3, 3), stride=2),
                    nn.ReLU(True),
                    nn.BatchNorm2d(32),
                    nn.Conv2d(32, 32, kernel_size=(3, 3), stride=2),
                    nn.ReLU(True),
                    nn.BatchNorm2d(32)
                ]

                feat_size = (((feature_size // channels) - 3) // 4) * 32
                # cnn.append()
                self.audio_trans = nn.Sequential(*cnn)
                self.linear_trans = nn.Linear(feat_size, self.model_size)
                # assert self.model_size == feat_size, \
                #     "The model dimension doesn't match with the feature dim, expecting %d " % feat_size
        else:
            self.word_lut = None  # [4*768, model_size]
            self.vec_linear = vec_linear  # [bert_hidden_size, transformer_model_size]

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size,
                                           self.model_size,
                                           1,
                                           batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size,
                                            self.model_size,
                                            1,
                                            batch_first=True)

        self.preprocess_layer = PrePostProcessing(
            self.model_size,
            self.enc_emb_dropout,
            sequence='d',
            variational=self.variational_dropout)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.positional_encoder = positional_encoder

        self.build_modules()
Example #28
    def __init__(self, opt, dicts, positional_encoder, encoder_type):

        super(TransformerEncoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        if hasattr(opt, 'encoder_layers') and opt.encoder_layers != -1:
            self.layers = opt.encoder_layers
        else:
            self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.input_type = encoder_type

        # input lookup table
        if encoder_type != "text":
            self.audio_trans = nn.Linear(dicts, self.model_size)
        else:
            self.word_lut = nn.Embedding(dicts.size(),
                                         self.model_size,
                                         padding_idx=onmt.Constants.PAD)

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size,
                                           self.model_size,
                                           1,
                                           batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size,
                                            self.model_size,
                                            1,
                                            batch_first=True)

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.positional_encoder = positional_encoder

        self.limit_rhs_steps = opt.limit_rhs_steps

        self.build_modules(limit_rhs_steps=opt.limit_rhs_steps)
        if self.limit_rhs_steps is not None:
            largest_rhs_mask = positional_encoder.len_max + self.limit_rhs_steps
            rhs_mask = torch.BoolTensor(
                np.triu(np.ones((largest_rhs_mask, largest_rhs_mask)),
                        k=1 + self.limit_rhs_steps).astype('uint8'))
            self.register_buffer('rhs_mask', rhs_mask)

        if opt.freeze_encoder:
            for p in self.parameters():
                p.requires_grad = False
                print(p.requires_grad)
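
Note: the rhs_mask in Example #28 is the same np.triu construction shifted by k = 1 + limit_rhs_steps, so each position may additionally attend to that many future steps. A small illustration with made-up sizes:

import numpy as np

len_max, limit_rhs_steps = 5, 1
# 1 marks the positions that remain blocked (more than limit_rhs_steps ahead)
rhs = np.triu(np.ones((len_max, len_max), dtype='uint8'), k=1 + limit_rhs_steps)
print(rhs)
# [[0 0 1 1 1]
#  [0 0 0 1 1]
#  [0 0 0 0 1]
#  [0 0 0 0 0]
#  [0 0 0 0 0]]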