def build_modules(self):

    e_length = expected_length(self.layers, self.death_rate)

    if self.reversible:
        print("* Reversible Transformer Decoder with Absolute Attention with %.2f expected layers" % e_length)
    else:
        print("* Transformer Decoder with Absolute Attention with %.2f expected layers" % e_length)

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # linearly decay the death rate
        death_r = (_l + 1.0) / self.layers * self.death_rate

        if not self.reversible:
            block = DecoderLayer(self.opt, death_rate=death_r)
        else:
            block = ReversibleTransformerDecoderLayer(self.opt, death_rate=death_r)

        self.layer_modules.append(block)
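# A minimal sketch (an assumption, not the repository's helper) of what
# `expected_length` computes: with stochastic depth and linearly decayed death
# rates death_r = (l + 1) / L * p, layer l survives with probability
# 1 - death_r, so the expected number of executed layers is
# sum_l (1 - death_r) = L - p * (L + 1) / 2.
def expected_length_sketch(num_layers, death_rate):
    e_length = 0.0
    for _l in range(num_layers):
        survival = 1.0 - (_l + 1.0) / num_layers * death_rate
        e_length += survival
    return e_length

# e.g. expected_length_sketch(6, 0.5) == 6 - 0.5 * 7 / 2 == 4.25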
def build_modules(self):

    e_length = expected_length(self.layers, self.death_rate)
    self.opt.ignore_source = self.ignore_source

    if self.reversible:
        print("* Transformer Reversible Decoder with Relative Attention with %.2f expected layers" % e_length)
    else:
        print("* Transformer Decoder with Relative Attention with %.2f expected layers" % e_length)

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # linearly decay the death rate
        death_r = (_l + 1.0) / self.layers * self.death_rate

        if not self.reversible:
            block = RelativeTransformerDecoderLayer(self.opt, death_rate=death_r)
        else:
            block = ReversibleTransformerDecoderLayer(self.opt)

        self.layer_modules.append(block)
def build_modules(self):

    e_length = expected_length(self.layers, self.death_rate)

    if self.reversible:
        print("* Reversible Transformer Encoder with Absolute Attention with %.2f expected layers" % e_length)
    else:
        print("* Transformer Encoder with Absolute Attention with %.2f expected layers" % e_length)

    if self.lsh_src_attention:
        from onmt.models.reformer import ReformerEncoderLayer

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # linearly decay the death rate
        death_r = (_l + 1.0) / self.layers * self.death_rate

        if not self.lsh_src_attention:
            if not self.reversible:
                block = EncoderLayer(self.opt, death_rate=death_r)
            else:
                block = ReversibleTransformerEncoderLayer(self.opt, death_rate=death_r)
        else:
            block = ReformerEncoderLayer(self.opt, death_rate=death_r)

        self.layer_modules.append(block)
def build_modules(self):

    from .relative_transformer_layers import LIDFeedForward

    e_length = expected_length(self.layers, 0.0)
    self.opt.ignore_source = self.ignore_source
    opt = self.opt

    print("* Speech Transformer Decoder with Relative Attention with %.2f layers" % e_length)

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # stochastic depth is disabled here: the death rate is fixed at zero
        death_r = 0.0

        lid_network = LIDFeedForward(opt.model_size, 2 * opt.model_size, opt.bottleneck_size,
                                     opt.n_languages, dropout=opt.dropout)

        block = RelativeTransformerDecoderLayer(self.opt, death_rate=death_r, lid_net=lid_network)
        self.layer_modules.append(block)
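# Hedged sketch of a LIDFeedForward-style module, inferred only from the call
# above (model_size -> 2 * model_size hidden -> bottleneck -> n_languages
# logits). The actual layer in relative_transformer_layers may be structured
# differently.
import torch.nn as nn

class LIDFeedForwardSketch(nn.Module):
    def __init__(self, model_size, hidden_size, bottleneck_size, n_languages, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(model_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, bottleneck_size),
            nn.ReLU(),
            nn.Linear(bottleneck_size, n_languages),
        )

    def forward(self, x):
        # unnormalized language-ID logits per position
        return self.net(x)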
def build_modules(self):

    e_length = expected_length(self.layers, self.death_rate)
    print("* Universal Transformer Decoder with Absolute Attention with %.2f expected layers" % e_length)

    self.universal_layer = UniversalDecoderLayer(self.opt, death_rate=self.death_rate)
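# Illustrative sketch only: a universal transformer applies the same shared
# layer at every depth step, which is why build_modules above creates a single
# `universal_layer` instead of a ModuleList. The argument handling below is a
# placeholder; the repository's actual forward pass may differ.
def _universal_forward_sketch(self, output, *args, **kwargs):
    for _step in range(self.layers):
        # the same parameters are reused at every depth step
        output = self.universal_layer(output, *args, **kwargs)
    return output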
def build_modules(self):

    e_length = expected_length(self.layers, self.death_rate)
    print("* Relative Translation Encoder with %.2f expected layers" % e_length)

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # linearly decay the death rate
        death_r = (_l + 1.0) / self.layers * self.death_rate
        block = RelativeTransformerEncoderLayer(self.opt, death_rate=death_r)
        self.layer_modules.append(block)
def build_modules(self):

    e_length = expected_length(self.layers, self.death_rate)
    print("* Transformer Decoder with Absolute Attention with %.2f expected layers" % e_length)

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # linearly decay the death rate
        death_r = (_l + 1.0) / self.layers * self.death_rate
        block = DecoderLayer(self.opt, death_rate=death_r)
        self.layer_modules.append(block)
def build_modules(self):

    e_length = expected_length(self.layers, self.death_rate)
    print("* Conformer Encoder with %.2f expected layers" % e_length)
    if self.unidirectional:
        print("* Running a unidirectional Encoder.")

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # linearly decay the death rate
        death_r = (_l + 1.0) / self.layers * self.death_rate
        block = ConformerEncoderLayer(self.opt, death_rate=death_r)
        self.layer_modules.append(block)
def build_modules(self):

    e_length = expected_length(self.layers, self.death_rate)
    print("* Transformer LM Decoder with Relative Attention with %.2f expected layers" % e_length)

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # linearly decay the death rate
        death_r = (_l + 1.0) / self.layers * self.death_rate
        block = TransformerXLDecoderLayer(self.n_heads, self.model_size, self.dropout,
                                          self.inner_size, self.attn_dropout,
                                          ignore_source=True,
                                          variational=self.variational_dropout,
                                          death_rate=death_r)
        self.layer_modules.append(block)
def build_modules(self):

    assert not self.opt.src_reversible

    e_length = expected_length(self.layers, self.death_rate)
    print("* Bayes-By-Backprop Relative Transformer Encoder with %.2f expected layers" % e_length)
    if self.unidirectional:
        print("* Running a unidirectional Encoder.")

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # linearly decay the death rate
        death_r = (_l + 1.0) / self.layers * self.death_rate
        block = TransformerEncoderLayer(self.opt, death_rate=death_r)
        self.layer_modules.append(block)
def build_modules(self):

    e_length = expected_length(self.layers, self.death_rate)
    self.opt.ignore_source = self.ignore_source
    print("* Bayes-By-Backprop Relative Transformer Decoder with %.2f expected layers" % e_length)

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # linearly decay the death rate
        death_r = (_l + 1.0) / self.layers * self.death_rate
        block = TransformerDecoderLayer(self.opt, death_rate=death_r)
        self.layer_modules.append(block)
def build_modules(self):

    # stochastic depth is disabled for this decoder
    self.death_rate = 0.0
    e_length = expected_length(self.layers, self.death_rate)
    self.opt.ignore_source = self.ignore_source

    print("* Speech Transformer Decoder with Relative Attention with %.2f layers" % e_length)

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # with death_rate fixed at zero, the decayed rate is always zero
        death_r = (_l + 1.0) / self.layers * self.death_rate
        block = RelativeTransformerDecoderLayer(self.opt, death_rate=death_r)
        self.layer_modules.append(block)
def build_modules(self):

    e_length = expected_length(self.layers, self.death_rate)
    print("* Transformer Encoder with Absolute Attention with %.2f expected layers" % e_length)

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # linearly decay the death rate
        death_r = (_l + 1.0) / self.layers * self.death_rate
        block = EncoderLayer(self.n_heads, self.model_size, self.dropout,
                             self.inner_size, self.attn_dropout,
                             variational=self.variational_dropout,
                             death_rate=death_r)
        self.layer_modules.append(block)
def build_modules(self):

    e_length = expected_length(self.layers, self.death_rate)

    if self.reversible:
        print("* Reversible Encoder with Relative Attention with %.2f expected layers" % e_length)
    else:
        print("* Transformer Encoder with Relative Attention with %.2f expected layers" % e_length)

    if self.unidirectional:
        print("* Running a unidirectional Encoder.")

    self.layer_modules = nn.ModuleList()

    for _l in range(self.layers):
        # linearly decay the death rate
        death_r = (_l + 1.0) / self.layers * self.death_rate

        if not self.reversible:
            block = RelativeTransformerEncoderLayer(self.opt, death_rate=death_r)
        else:
            block = ReversibleTransformerEncoderLayer(self.opt, death_rate=death_r)

        self.layer_modules.append(block)
def __init__(self, opt, dicts, positional_encoder, encoder_type='text'):
    self.death_rate = opt.death_rate
    self.double_position = opt.double_position
    self.max_pos_length = opt.max_pos_length
    self.layer_modules = list()

    # build_modules will be called from the inherited constructor
    super(RelativeTransformerEncoder, self).__init__(opt, dicts, positional_encoder, encoder_type)
    print("Encoder type: %s" % encoder_type)

    # self.positional_encoder = SinusoidalPositionalEmbedding(opt.model_size)
    # embedding for the relative positions:
    # 2N + 1 entries because the offset runs from -N through 0 to N
    self.positional_encoder = nn.Embedding(2 * self.max_pos_length + 1, self.model_size)
    self.d_head = self.model_size // self.n_heads

    e_length = expected_length(self.layers, self.death_rate)
    print("* Transformer Encoder with Relative Attention with %.2f expected layers" % e_length)
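# Hedged sketch of how signed relative offsets would index the 2N + 1 entry
# table above: clamp the distance (key position minus query position) to
# [-N, N], then shift by N into valid row indices [0, 2N]. Illustrative only;
# the attention layer in this repository may compute its indices differently.
import torch

def relative_position_ids_sketch(qlen, klen, max_pos_length):
    offsets = torch.arange(klen).unsqueeze(0) - torch.arange(qlen).unsqueeze(1)
    offsets = offsets.clamp(-max_pos_length, max_pos_length)
    return offsets + max_pos_length  # usable with nn.Embedding(2 * N + 1, d)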