Example 1
    def __init__(
        self,
        hidden_size,
        num_attention_heads,
        attn_score_dropout=0.0,
        attn_layer_dropout=0.0,
    ):
        super().__init__()
        if hidden_size % num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number "
                "of attention heads (%d)" % (hidden_size, num_attention_heads))
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.attn_head_size = hidden_size // num_attention_heads
        self.attn_scale = math.sqrt(math.sqrt(self.attn_head_size))

        self.query_net = nn.Linear(hidden_size, hidden_size)
        self.key_net = nn.Linear(hidden_size, hidden_size)
        self.value_net = nn.Linear(hidden_size, hidden_size)
        self.out_projection = nn.Linear(hidden_size, hidden_size)

        self.attn_dropout = nn.Dropout(attn_score_dropout)
        self.layer_dropout = nn.Dropout(attn_layer_dropout)
        self.layer_norm = FusedLayerNorm(hidden_size, eps=1e-5)
Example 2
    def __init__(self, vocab_size, embedding_size, hidden_size,
                 max_sequence_length=512, num_token_types=2,
                 embedding_dropout=0.0, learn_positional_encodings=False):
        super().__init__()

        self.max_sequence_length = max_sequence_length
        self.token_embedding = nn.Embedding(
            vocab_size, embedding_size, padding_idx=0)

        if embedding_size == hidden_size:
            self.encode_ids_fn = lambda x: self.token_embedding(x)
        else:
            self.token2hidden = nn.Linear(
                embedding_size, hidden_size, bias=False)
            self.encode_ids_fn = \
                lambda x: self.token2hidden(self.token_embedding(x))

        if learn_positional_encodings:
            self.position_embedding = nn.Embedding(
                max_sequence_length, hidden_size)
        else:
            self.position_embedding = FixedPositionalEncoding(
                hidden_size, max_sequence_length)

        self.token_type_embedding = nn.Embedding(num_token_types, hidden_size)
        self.layer_norm = FusedLayerNorm(hidden_size, eps=1e-5)
        self.dropout = nn.Dropout(embedding_dropout)
Example 3
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
    if torch.cuda.is_available():
        try:
            from apex.normalization import FusedLayerNorm
            return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
        except ImportError:
            pass
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
Example 4
    def __init__(self, hidden_size, inner_size, ffn_dropout=0.0, hidden_act="relu"):
        super().__init__()
        self.dense_in = nn.Linear(hidden_size, inner_size)
        self.dense_out = nn.Linear(inner_size, hidden_size)
        self.layer_dropout = nn.Dropout(ffn_dropout)
        self.layer_norm = FusedLayerNorm(hidden_size, eps=1e-5)
        ACT2FN = {"gelu": gelu, "relu": torch.relu}
        self.act_fn = ACT2FN[hidden_act]
Example 5
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False, args=None):
    if args is not None:
        if args.lnv != 'origin':
            return LayerNormImpl(args, normalized_shape, eps, elementwise_affine)
    if not export and torch.cuda.is_available():
        try:
            from apex.normalization import FusedLayerNorm
            return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
        except ImportError:
            pass
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
Example 6
    def __init__(self, hidden_size, num_attention_heads, kernel_size, conv_weight_dropout=0.0, conv_layer_dropout=0.0):
        super().__init__()
        self.num_heads = num_attention_heads
        self.kernel_size = kernel_size
        self.weight = nn.Parameter(torch.Tensor(num_attention_heads, 1, kernel_size))
        self.in_projection = nn.Linear(hidden_size, hidden_size)
        self.out_projection = nn.Linear(hidden_size, hidden_size)

        self.conv_weight_dropout = nn.Dropout(conv_weight_dropout)
        self.conv_layer_dropout = nn.Dropout(conv_layer_dropout)
        self.layer_norm = FusedLayerNorm(hidden_size, eps=1e-5)
Example 7
    def __init__(self, config):
        super(Encoder, self).__init__()
        self.att_heads = config.num_attention_heads
#        self.initializer = Initializer(config)
#        layer = EncoderLayer(config)
#        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
#        self.layer = nn.ModuleList([layer])
#        self.conv = FastRGCNConv(config.hidden_size,config.hidden_size)
#        self.conv3 = FastRGCNConv(config.hidden_size,config.hidden_size,25,num_bases=128)
        
        # self.ctoq = MultiHeadedAttention(self.att_heads,config.hidden_size)
        self.qtoc = MultiHeadedAttention(self.att_heads,config.hidden_size)
        self.uttAtt = MaskMultiHeadedAttention(self.att_heads,config.hidden_size)
        
        # self.rnn = torch.nn.LSTM(config.hidden_size,config.hidden_size // 2,dropout=0.4,
        #                          bidirectional=True, num_layers=2, batch_first=True)
        self.gelu = torch.nn.functional.gelu
        
        # self.conv3 = RGCNConv(config.hidden_size, config.hidden_size, 35, num_bases=30)
        # self.conv2 = torch.nn.ModuleList()
        # for i in range(2):
        #     self.conv2.append(
        #             DNAConv(config.hidden_size,self.att_heads,1,0.4))
        # self.conv3 = torch.nn.ModuleList()
        # for i in range(2):
        #     self.conv3.append(
        #         DNAConv(config.hidden_size,self.att_heads,1,0,0.4))
            
        # self.conv = GraphConv(config.hidden_size, config.hidden_size,'max')
            
        # self.lineSub = torch.nn.Linear(config.hidden_size*3,config.hidden_size)
        # self.lineSub = torch.nn.Linear(config.hidden_size*2,config.hidden_size)
        #self.lineSub = torch.nn.Linear(config.hidden_size*2,config.hidden_size)
        
        self.hidden_size = config.hidden_size
        self.config = config
        self.dropout = nn.Dropout(0.1)
        self.fuseLayerNorm = FusedLayerNorm(config.hidden_size)

        # self.dropout = nn.Dropout(0.3) seems too high
        
        self.TopNet = nn.ModuleList([getMaxScore2(self.hidden_size) for _ in range(1)])
        self.TopNet[0].ql = self.qtoc.linears[0]
        self.TopNet[0].kl = self.qtoc.linears[1]
        
        # self.BoudSelect = nn.ModlueList([getThresScore(self.hidden_size) for _ in range(3)])
        self.dnaAct = torch.relu
Example 8
def get_norm_layer(name, out_features, num_groups=1, eps=1e-5, affine=True):
    if name == 'gn' and num_groups == 1:
        name = 'bn'

    if name == 'bn':
        return BatchNorm(num_features=out_features, eps=eps, affine=affine)
    elif name == 'ln':
        try:
            from apex.normalization import FusedLayerNorm
            return FusedLayerNorm(out_features, eps, affine)
        except ImportError:
            return nn.LayerNorm(out_features, eps=eps, elementwise_affine=affine)
    elif name == 'gn':
        return nn.GroupNorm(num_groups=num_groups, num_channels=out_features, eps=eps, affine=affine)
    else:
        print_error_message('Supported normalization functions: {}'.format(norm_layer_list))
        return None
Example 9
import math

import torch
from torch import nn

from nemo import logging
from nemo.collections.nlp.utils.functional_utils import gelu

__all__ = []

try:
    from apex.normalization import FusedLayerNorm

    # Instantiate FusedLayerNorm once - this triggers an error (and the fallback below) if Apex is not properly installed.
    _ = FusedLayerNorm(8, eps=1e-5)

except Exception as e:
    logging.warning(
        "Unable to import FusedLayerNorm  from APEX. Using regular LayerNorm instead."
    )
    from torch.nn import LayerNorm as FusedLayerNorm


class FixedPositionalEncoding(nn.Module):
    """
    Fixed positional encoding (embedding layer) from sine and cosine functions
    of different frequencies according to https://arxiv.org/abs/1706.03762

    Args:
        hidden_size: size of the embeddings in the model, also known as d_model
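
The docstring above describes the standard sinusoidal scheme from https://arxiv.org/abs/1706.03762. As a rough, self-contained sketch of that scheme (the class name, buffer layout, and forward signature below are illustrative assumptions, not the implementation this snippet truncates), the encoding table could be precomputed like this:

import math

import torch
from torch import nn


class SinusoidalPositionalEncodingSketch(nn.Module):
    """Illustrative fixed positional encoding: even feature indices use sine,
    odd feature indices use cosine, with geometrically increasing wavelengths.
    Assumes an even hidden_size."""

    def __init__(self, hidden_size, max_sequence_length=512):
        super().__init__()
        position = torch.arange(max_sequence_length, dtype=torch.float).unsqueeze(1)
        # 1 / 10000^(2i / hidden_size) for each pair of feature dimensions
        div_term = torch.exp(
            torch.arange(0, hidden_size, 2, dtype=torch.float)
            * (-math.log(10000.0) / hidden_size)
        )
        pos_enc = torch.zeros(max_sequence_length, hidden_size)
        pos_enc[:, 0::2] = torch.sin(position * div_term)
        pos_enc[:, 1::2] = torch.cos(position * div_term)
        # Registered as a buffer: stored with the module's state but never trained.
        self.register_buffer("pos_enc", pos_enc)

    def forward(self, position_ids):
        # position_ids: LongTensor of shape (batch, seq_len)
        # returns: (batch, seq_len, hidden_size)
        return self.pos_enc[position_ids]

A learned alternative, as in the learn_positional_encodings branch of Example 2, would simply use nn.Embedding(max_sequence_length, hidden_size) instead of the precomputed buffer.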
Example 10
import torch
import fused_layer_norm_cuda
from apex.normalization import FusedLayerNorm
import pyprof2

pyprof2.init()
pyprof2.wrap(fused_layer_norm_cuda, 'forward')
pyprof2.wrap(fused_layer_norm_cuda, 'backward')
pyprof2.wrap(fused_layer_norm_cuda, 'forward_affine')
pyprof2.wrap(fused_layer_norm_cuda, 'backward_affine')

input = torch.randn(20, 5, 10, 10).cuda()

# With Learnable Parameters
m = FusedLayerNorm(input.size()[1:]).cuda()
output = m(input)

# Without Learnable Parameters
m = FusedLayerNorm(input.size()[1:], elementwise_affine=False).cuda()
output = m(input)

# Normalize over last two dimensions
m = FusedLayerNorm([10, 10]).cuda()
output = m(input)

# Normalize over last dimension of size 10
m = FusedLayerNorm(10).cuda()
output = m(input)