def __init__(
    self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0,
):
    super().__init__()
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number "
            "of attention heads (%d)" % (hidden_size, num_attention_heads)
        )
    self.hidden_size = hidden_size
    self.num_attention_heads = num_attention_heads
    self.attn_head_size = int(hidden_size / num_attention_heads)
    self.attn_scale = math.sqrt(math.sqrt(self.attn_head_size))

    self.query_net = nn.Linear(hidden_size, hidden_size)
    self.key_net = nn.Linear(hidden_size, hidden_size)
    self.value_net = nn.Linear(hidden_size, hidden_size)
    self.out_projection = nn.Linear(hidden_size, hidden_size)

    self.attn_dropout = nn.Dropout(attn_score_dropout)
    self.layer_dropout = nn.Dropout(attn_layer_dropout)
    self.layer_norm = FusedLayerNorm(hidden_size, eps=1e-5)
def __init__(
    self,
    vocab_size,
    embedding_size,
    hidden_size,
    max_sequence_length=512,
    num_token_types=2,
    embedding_dropout=0.0,
    learn_positional_encodings=False,
):
    super().__init__()
    self.max_sequence_length = max_sequence_length
    self.token_embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
    if embedding_size == hidden_size:
        self.encode_ids_fn = lambda x: self.token_embedding(x)
    else:
        self.token2hidden = nn.Linear(embedding_size, hidden_size, bias=False)
        self.encode_ids_fn = lambda x: self.token2hidden(self.token_embedding(x))
    if learn_positional_encodings:
        self.position_embedding = nn.Embedding(max_sequence_length, hidden_size)
    else:
        self.position_embedding = FixedPositionalEncoding(hidden_size, max_sequence_length)
    self.token_type_embedding = nn.Embedding(num_token_types, hidden_size)
    self.layer_norm = FusedLayerNorm(hidden_size, eps=1e-5)
    self.dropout = nn.Dropout(embedding_dropout)
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
    if torch.cuda.is_available():
        try:
            from apex.normalization import FusedLayerNorm

            return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
        except ImportError:
            pass
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
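A minimal usage sketch for the factory above (the tensor shapes and variable names are illustrative, not from the original source); the returned module is a drop-in nn.Module layer norm either way.

norm = LayerNorm(768)            # FusedLayerNorm when CUDA and apex are available, torch.nn.LayerNorm otherwise
x = torch.randn(4, 16, 768)      # [batch, seq_len, hidden]
y = norm(x)                      # same shape as the input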
def __init__(self, hidden_size, inner_size, ffn_dropout=0.0, hidden_act="relu"):
    super().__init__()
    self.dense_in = nn.Linear(hidden_size, inner_size)
    self.dense_out = nn.Linear(inner_size, hidden_size)
    self.layer_dropout = nn.Dropout(ffn_dropout)
    self.layer_norm = FusedLayerNorm(hidden_size, eps=1e-5)
    ACT2FN = {"gelu": gelu, "relu": torch.relu}
    self.act_fn = ACT2FN[hidden_act]
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False, args=None):
    if args is not None:
        if args.lnv != 'origin':
            return LayerNormImpl(args, normalized_shape, eps, elementwise_affine)
    if not export and torch.cuda.is_available():
        try:
            from apex.normalization import FusedLayerNorm

            return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
        except ImportError:
            pass
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
def __init__(self, hidden_size, num_attention_heads, kernel_size,
             conv_weight_dropout=0.0, conv_layer_dropout=0.0):
    super().__init__()
    self.num_heads = num_attention_heads
    self.kernel_size = kernel_size
    self.weight = nn.Parameter(torch.Tensor(num_attention_heads, 1, kernel_size))
    self.in_projection = nn.Linear(hidden_size, hidden_size)
    self.out_projection = nn.Linear(hidden_size, hidden_size)
    self.conv_weight_dropout = nn.Dropout(conv_weight_dropout)
    self.conv_layer_dropout = nn.Dropout(conv_layer_dropout)
    self.layer_norm = FusedLayerNorm(hidden_size, eps=1e-5)
def __init__(self, config):
    super(Encoder, self).__init__()
    self.att_heads = config.num_attention_heads
    # self.initializer = Initializer(config)
    # layer = EncoderLayer(config)
    # self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
    # self.layer = nn.ModuleList([layer])
    # self.conv = FastRGCNConv(config.hidden_size, config.hidden_size)
    # self.conv3 = FastRGCNConv(config.hidden_size, config.hidden_size, 25, num_bases=128)
    # self.ctoq = MultiHeadedAttention(self.att_heads, config.hidden_size)
    self.qtoc = MultiHeadedAttention(self.att_heads, config.hidden_size)
    self.uttAtt = MaskMultiHeadedAttention(self.att_heads, config.hidden_size)
    # self.rnn = torch.nn.LSTM(config.hidden_size, config.hidden_size // 2, dropout=0.4,
    #                          bidirectional=True, num_layers=2, batch_first=True)
    self.gelu = torch.nn.functional.gelu
    # self.conv3 = RGCNConv(config.hidden_size, config.hidden_size, 35, num_bases=30)
    # self.conv2 = torch.nn.ModuleList()
    # for i in range(2):
    #     self.conv2.append(DNAConv(config.hidden_size, self.att_heads, 1, 0.4))
    # self.conv3 = torch.nn.ModuleList()
    # for i in range(2):
    #     self.conv3.append(DNAConv(config.hidden_size, self.att_heads, 1, 0, 0.4))
    # self.conv = GraphConv(config.hidden_size, config.hidden_size, 'max')
    # self.lineSub = torch.nn.Linear(config.hidden_size * 3, config.hidden_size)
    # self.lineSub = torch.nn.Linear(config.hidden_size * 2, config.hidden_size)
    self.hidden_size = config.hidden_size
    self.config = config
    self.dropout = nn.Dropout(0.1)
    self.fuseLayerNorm = FusedLayerNorm(config.hidden_size)
    # self.dropout = nn.Dropout(0.3)  # seems too high
    self.TopNet = nn.ModuleList([getMaxScore2(self.hidden_size) for _ in range(1)])
    self.TopNet[0].ql = self.qtoc.linears[0]
    self.TopNet[0].kl = self.qtoc.linears[1]
    # self.BoudSelect = nn.ModuleList([getThresScore(self.hidden_size) for _ in range(3)])
    self.dnaAct = torch.relu
def get_norm_layer(name, out_features, num_groups=1, eps=1e-5, affine=True):
    if name == 'gn' and num_groups == 1:
        name = 'bn'
    if name == 'bn':
        return BatchNorm(num_features=out_features, eps=eps, affine=affine)
    elif name == 'ln':
        try:
            from apex.normalization import FusedLayerNorm

            return FusedLayerNorm(out_features, eps, affine)
        except Exception:
            return nn.LayerNorm(out_features, eps=eps, elementwise_affine=affine)
    elif name == 'gn':
        return nn.GroupNorm(num_groups=num_groups, num_channels=out_features, eps=eps, affine=affine)
    else:
        print_error_message('Supported normalization functions: {}'.format(norm_layer_list))
        return None
import math

import torch
from torch import nn

from nemo import logging
from nemo.collections.nlp.utils.functional_utils import gelu

__all__ = []

try:
    from apex.normalization import FusedLayerNorm

    # Instantiate once so a broken Apex install fails here rather than at model build time.
    _ = FusedLayerNorm(8, eps=1e-5)
except Exception:
    logging.warning("Unable to import FusedLayerNorm from APEX. Using regular LayerNorm instead.")
    from torch.nn import LayerNorm as FusedLayerNorm


class FixedPositionalEncoding(nn.Module):
    """
    Fixed positional encoding (embedding layer) built from sine and cosine
    functions of different frequencies, following https://arxiv.org/abs/1706.03762.

    Args:
        hidden_size: size of the embeddings in the model, also known as d_model
    """
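The NeMo snippet above is cut off after the docstring. The sketch below is a minimal, hypothetical implementation of the sine/cosine scheme that docstring describes; the class name, buffer layout, and forward signature are assumptions for illustration, not NeMo's actual code.

import math
import torch
from torch import nn


class SinusoidalPositionalEncoding(nn.Module):
    # Hypothetical sketch: precompute sin/cos positional encodings (assumes even hidden_size).
    def __init__(self, hidden_size, max_sequence_length=512):
        super().__init__()
        position = torch.arange(max_sequence_length, dtype=torch.float).unsqueeze(1)
        # Frequencies 1 / 10000^(2i / hidden_size) for each pair of channels.
        div_term = torch.exp(
            torch.arange(0, hidden_size, 2, dtype=torch.float) * (-math.log(10000.0) / hidden_size)
        )
        pe = torch.zeros(max_sequence_length, hidden_size)
        pe[:, 0::2] = torch.sin(position * div_term)  # even channels
        pe[:, 1::2] = torch.cos(position * div_term)  # odd channels
        self.register_buffer("pe", pe)  # not a learnable parameter

    def forward(self, position_ids):
        # position_ids: LongTensor of shape [batch, seq_len]; returns [batch, seq_len, hidden_size].
        return self.pe[position_ids]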
import torch
import fused_layer_norm_cuda
from apex.normalization import FusedLayerNorm
import pyprof2

pyprof2.init()
pyprof2.wrap(fused_layer_norm_cuda, 'forward')
pyprof2.wrap(fused_layer_norm_cuda, 'backward')
pyprof2.wrap(fused_layer_norm_cuda, 'forward_affine')
pyprof2.wrap(fused_layer_norm_cuda, 'backward_affine')

input = torch.randn(20, 5, 10, 10).cuda()

# With learnable parameters
m = FusedLayerNorm(input.size()[1:]).cuda()
output = m(input)

# Without learnable parameters
m = FusedLayerNorm(input.size()[1:], elementwise_affine=False).cuda()
output = m(input)

# Normalize over the last two dimensions
m = FusedLayerNorm([10, 10]).cuda()
output = m(input)

# Normalize over the last dimension of size 10
m = FusedLayerNorm(10).cuda()
output = m(input)