# TransformerEncoder: Transformer stack with configurable masking, LayerNorm positioning
# (pre/post), LayerNorm type (T5 or standard), relative position embeddings, residual
# attention, and optional DeepSpeed activation checkpointing.
def __init__(self, args):
    super(TransformerEncoder, self).__init__()
    self.mask = args.mask
    self.layers_num = args.layers_num
    self.parameter_sharing = args.parameter_sharing
    self.factorized_embedding_parameterization = args.factorized_embedding_parameterization
    self.layernorm_positioning = args.layernorm_positioning
    self.relative_position_embedding = args.relative_position_embedding
    self.has_residual_attention = args.has_residual_attention

    # DeepSpeed activation checkpointing settings are optional; fall back to disabled if absent.
    if "deepspeed_checkpoint_activations" in args:
        self.deepspeed_checkpoint_activations = args.deepspeed_checkpoint_activations
        self.deepspeed_checkpoint_layers_num = args.deepspeed_checkpoint_layers_num
    else:
        self.deepspeed_checkpoint_activations = False

    has_bias = bool(1 - args.remove_transformer_bias)

    # ALBERT-style factorized embedding parameterization: project emb_size up to hidden_size.
    if self.factorized_embedding_parameterization:
        self.linear = nn.Linear(args.emb_size, args.hidden_size)

    # Either one shared TransformerLayer (parameter sharing) or a stack of independent layers.
    if self.parameter_sharing:
        self.transformer = TransformerLayer(args)
    else:
        self.transformer = nn.ModuleList(
            [TransformerLayer(args) for _ in range(self.layers_num)]
        )

    # Pre-LayerNorm placement adds a final normalization after the last layer.
    if self.layernorm_positioning == "pre":
        if args.layernorm == "t5":
            self.layer_norm = T5LayerNorm(args.hidden_size)
        else:
            self.layer_norm = LayerNorm(args.hidden_size)

    # Bucketed relative position embedding.
    if self.relative_position_embedding:
        self.relative_pos_emb = RelativePositionEmbedding(
            bidirectional=True,
            heads_num=args.heads_num,
            num_buckets=args.relative_attention_buckets_num,
        )
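The constructor above reads all of its options from a single args object. A minimal sketch of such a configuration follows, using argparse.Namespace as a stand-in for the parsed command-line arguments; the field names mirror the attributes accessed above, while the concrete values and the extra fields consumed by TransformerLayer (hidden size, head count, feed-forward size, dropout) are illustrative assumptions rather than the library's defaults.

import argparse

# Hypothetical configuration covering the attributes read by the constructor above.
# Values are illustrative; hidden_size, heads_num, feedforward_size, and dropout are
# assumed to be what TransformerLayer needs and may differ from the actual library.
args = argparse.Namespace(
    mask="fully_visible",
    layers_num=12,
    parameter_sharing=False,                     # False -> independent layers in an nn.ModuleList
    factorized_embedding_parameterization=False,
    layernorm_positioning="pre",                 # "pre" adds a final LayerNorm
    layernorm="normal",                          # "t5" would select T5LayerNorm
    relative_position_embedding=True,
    relative_attention_buckets_num=32,
    has_residual_attention=False,
    remove_transformer_bias=False,               # has_bias = bool(1 - ...) -> True
    emb_size=768,
    hidden_size=768,
    heads_num=12,
    feedforward_size=3072,
    dropout=0.1,
)
# Because the namespace omits deepspeed_checkpoint_activations, the "in args" check
# above falls back to deepspeed_checkpoint_activations = False.
# encoder = TransformerEncoder(args)  # assuming the class and its sub-modules are importable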
# TransformerEncoder variant without residual attention or DeepSpeed checkpointing;
# the LayerNorm bias is controlled by remove_transformer_bias.
def __init__(self, args):
    super(TransformerEncoder, self).__init__()
    self.mask = args.mask
    self.layers_num = args.layers_num
    self.parameter_sharing = args.parameter_sharing
    self.factorized_embedding_parameterization = args.factorized_embedding_parameterization
    self.layernorm_positioning = args.layernorm_positioning
    self.relative_position_embedding = args.relative_position_embedding

    has_bias = bool(1 - args.remove_transformer_bias)

    if self.factorized_embedding_parameterization:
        self.linear = nn.Linear(args.emb_size, args.hidden_size)

    if self.parameter_sharing:
        self.transformer = TransformerLayer(args)
    else:
        self.transformer = nn.ModuleList(
            [TransformerLayer(args) for _ in range(self.layers_num)]
        )

    if self.layernorm_positioning == "pre":
        self.layer_norm = LayerNorm(args.hidden_size, has_bias=has_bias)

    if self.relative_position_embedding:
        self.relative_pos_emb = RelativePositionEmbedding(
            bidirectional=True, heads_num=args.heads_num
        )
# BertEncoder with optional ALBERT-style parameter sharing and factorized embedding
# parameterization.
def __init__(self, args):
    super(BertEncoder, self).__init__()
    self.layers_num = args.layers_num
    self.parameter_sharing = args.parameter_sharing
    self.factorized_embedding_parameterization = args.factorized_embedding_parameterization

    if self.factorized_embedding_parameterization:
        self.linear = nn.Linear(args.emb_size, args.hidden_size)

    if self.parameter_sharing:
        self.transformer = TransformerLayer(args)
    else:
        self.transformer = nn.ModuleList(
            [TransformerLayer(args) for _ in range(self.layers_num)]
        )
# TransformerEncoder with configurable LayerNorm positioning; the "pre" setting adds a
# final LayerNorm after the stack.
def __init__(self, args):
    super(TransformerEncoder, self).__init__()
    self.mask = args.mask
    self.layers_num = args.layers_num
    self.parameter_sharing = args.parameter_sharing
    self.factorized_embedding_parameterization = args.factorized_embedding_parameterization
    self.layernorm_positioning = args.layernorm_positioning

    if self.factorized_embedding_parameterization:
        self.linear = nn.Linear(args.emb_size, args.hidden_size)

    if self.parameter_sharing:
        self.transformer = TransformerLayer(args)
    else:
        self.transformer = nn.ModuleList(
            [TransformerLayer(args) for _ in range(self.layers_num)]
        )

    if self.layernorm_positioning == "pre":
        self.layer_norm = LayerNorm(args.hidden_size)
def __init__(self, args):
    super(BertEncoder, self).__init__()
    self.layers_num = args.layers_num
    self.transformer = nn.ModuleList(
        [TransformerLayer(args) for _ in range(self.layers_num)]
    )
def __init__(self, args):
    super(AlbertEncoder, self).__init__()
    self.layers_num = args.layers_num
    self.linear = nn.Linear(args.emb_size, args.hidden_size, bias=True)
    self.transformer = TransformerLayer(args)
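The AlbertEncoder keeps a single TransformerLayer and simply reuses it, which is how ALBERT shares weights across layers. The forward pass is not part of these snippets; the sketch below shows how the shared layer is typically applied, but the method name, argument order, and the TransformerLayer call signature are assumptions for illustration, not the library's actual interface.

# Minimal sketch (not the library's actual forward): ALBERT-style weight sharing applies
# the single TransformerLayer instance layers_num times, so every "layer" reuses one set
# of parameters. The (hidden, mask) signature is assumed.
def forward(self, emb, mask):
    hidden = self.linear(emb)                     # project emb_size -> hidden_size
    for _ in range(self.layers_num):
        hidden = self.transformer(hidden, mask)   # same module each pass, hence shared weights
    return hidden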