def __init__(self, config, output_attentions=False, keep_multihead_output=False):
    super(TransformerEncoder, self).__init__()
    self.output_attentions = output_attentions
    self.pre_layer_norm = config.pre_layer_norm
    layer = TransformerLayer(config, output_attentions=output_attentions,
                             keep_multihead_output=keep_multihead_output)
    if config.share_layer:
        # Weight sharing: every position in the ModuleList holds the same layer instance.
        self.layer = nn.ModuleList([layer for _ in range(config.num_hidden_layers)])
    else:
        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
    if self.pre_layer_norm:
        # For a pre-LN Transformer, a final layer norm is placed after the last layer,
        # and intermediate layer norms are applied to every layer's embedding output.
        LayerNorm = TransformerLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.LayerNorm = nn.ModuleList([copy.deepcopy(LayerNorm) for _ in range(config.num_hidden_layers)])
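# Illustrative sketch, not this repo's verbatim forward pass: one way the modules built
# above could be consumed. The names (`hidden_states`, `attention_mask`) are assumptions,
# and attention-probability outputs are omitted for brevity.
def _encoder_forward_sketch(encoder, hidden_states, attention_mask=None):
    all_encoder_layers = []
    for i, layer_module in enumerate(encoder.layer):
        hidden_states = layer_module(hidden_states, attention_mask)
        if encoder.pre_layer_norm:
            # Pre-LN: each exposed layer output gets its own LayerNorm.
            all_encoder_layers.append(encoder.LayerNorm[i](hidden_states))
        else:
            all_encoder_layers.append(hidden_states)
    return all_encoder_layers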
def __init__(self, config):
    super(TransformerSelfOutput, self).__init__()
    self.pre_layer_norm = config.pre_layer_norm
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.LayerNorm = TransformerLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
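# Illustrative sketch (assumed wiring, not verbatim code) of how these submodules are
# typically combined: post-LN normalizes after the residual add, while pre-LN leaves
# the residual path untouched because the input was already normalized upstream.
def _self_output_forward_sketch(module, hidden_states, input_tensor):
    hidden_states = module.dense(hidden_states)
    hidden_states = module.dropout(hidden_states)
    if module.pre_layer_norm:
        # Pre-LN: plain residual add; normalization happened before the sublayer.
        return hidden_states + input_tensor
    # Post-LN: LayerNorm applied to (sublayer output + residual).
    return module.LayerNorm(hidden_states + input_tensor)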
def __init__(self, config, input_dim):
    super(TransformerInputRepresentations, self).__init__()
    self.hidden_size = config.hidden_size
    self.spec_transform = nn.Linear(input_dim * config.downsample_rate, config.hidden_size)
    # self.LayerNorm is not snake-cased so that it matches the TensorFlow model variable
    # name, which lets any TensorFlow checkpoint file be loaded.
    self.LayerNorm = TransformerLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
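# Illustrative sketch with assumed names and shapes: `downsample_rate` consecutive
# spectrogram frames are stacked along the feature axis, so the input is
# (batch, seq_len, input_dim * downsample_rate) and is projected to hidden_size.
# Any positional information the full model may add is omitted here.
def _input_repr_forward_sketch(module, spec):
    spec_embeddings = module.spec_transform(spec)  # -> (batch, seq_len, hidden_size)
    spec_embeddings = module.LayerNorm(spec_embeddings)
    return module.dropout(spec_embeddings)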
def __init__(self, config, output_dim):
    super(TransformerSpecPredictionHead, self).__init__()
    self.output_dim = output_dim
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    # config.hidden_act is either a string key into ACT2FN (e.g. "gelu") or a callable;
    # the unicode check keeps Python 2 compatibility.
    if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
        self.transform_act_fn = ACT2FN[config.hidden_act]
    else:
        self.transform_act_fn = config.hidden_act
    self.LayerNorm = TransformerLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.output = nn.Linear(config.hidden_size, self.output_dim * config.downsample_rate)
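# Illustrative sketch (assumed, not verbatim): the head maps each encoder hidden state
# back to `downsample_rate` stacked spectrogram frames, i.e. an output of shape
# (batch, seq_len, output_dim * downsample_rate).
def _spec_head_forward_sketch(head, hidden_states):
    hidden_states = head.dense(hidden_states)
    hidden_states = head.transform_act_fn(hidden_states)
    hidden_states = head.LayerNorm(hidden_states)
    return head.output(hidden_states)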