Beispiel #1
0
    def __init__(
        self,
        n_heads,
        d_model,
        dropout_rate=0.0,
        skip_term_b=False,
        share_qvk_proj=False,
    ):
        """Build the relative-position self-attention sub-layer.

        :param int n_heads: number of attention heads
        :param int d_model: model (embedding) dimension
        :param float dropout_rate: dropout rate forwarded to the base class
        :param bool skip_term_b: presumably skips term (b) of the rel-pos
            attention score — confirm against the forward pass
        :param bool share_qvk_proj: if True, use one shared q/k/v projection
        """
        super(MultiHeadedSelfAttentionWithRelPos, self).__init__(
            n_heads, d_model, dropout_rate, share_qvk_proj
        )

        self.nheads = n_heads
        self.d_model = d_model
        self.d_k = d_model // n_heads
        self.skip_term_b = skip_term_b
        self.share_qvk_proj = share_qvk_proj

        # One projection when q/k/v share weights, otherwise a fused
        # 3 * d_model projection split downstream.
        proj_width = d_model if self.share_qvk_proj else d_model * 3
        self.qvk_proj = nn.Linear(d_model, proj_width)

        # Projection applied to the relative positional encodings (no bias).
        self.pos_proj = nn.Linear(d_model, d_model, bias=False)

        # Learned per-head bias vectors (uninitialized storage here; values
        # are presumably set by a later init step — confirm).
        self.posu = nn.Parameter(flow.Tensor(1, 1, n_heads, self.d_k))
        self.posv = nn.Parameter(flow.Tensor(1, 1, n_heads, self.d_k))
Beispiel #2
0
 def __init__(self, features, eps=1e-6):
     """Create learnable affine parameters for layer normalization.

     :param int features: size of the normalized feature dimension
     :param float eps: small constant for numerical stability
     """
     super(LayerNorm, self).__init__()
     self.eps = eps
     # Identity transform at init: scale = 1, shift = 0.
     ones = flow.ones(features, dtype=flow.float32)
     zeros = flow.zeros(features, dtype=flow.float32)
     self.weight = nn.Parameter(flow.Tensor(ones))
     self.bias = nn.Parameter(flow.Tensor(zeros))
Beispiel #3
0
 def __init__(self, input_sz, hidden_sz):
     """Allocate weight matrices for a 4-gate recurrent cell.

     :param int input_sz: input feature size
     :param int hidden_sz: hidden state size
     """
     super().__init__()
     self.input_sz = input_sz
     self.hidden_size = hidden_sz
     # Four gates stacked along the last axis (presumably an LSTM's
     # i/f/g/o gates — confirm against forward()).
     gate_width = hidden_sz * 4
     self.W = nn.Parameter(flow.Tensor(input_sz, gate_width))
     self.U = nn.Parameter(flow.Tensor(hidden_sz, gate_width))
     self.bias = nn.Parameter(flow.Tensor(gate_width))
     # Values above are uninitialized storage; init_weights fills them.
     self.init_weights()
Beispiel #4
0
 def __init__(self, dim, eps=1e-05, elementwise_affine=True):
     """Global channel layer normalization.

     :param int dim: number of channels being normalized
     :param float eps: numerical-stability constant
     :param bool elementwise_affine: learn per-channel scale/shift if True
     """
     super(GlobalChannelLayerNorm, self).__init__()
     self.eps = eps
     self.normalized_dim = dim
     self.elementwise_affine = elementwise_affine
     if elementwise_affine:
         # Per-channel shift (beta) and scale (gamma), shaped (dim, 1) so
         # they broadcast over the remaining axis.
         self.beta = nn.Parameter(flow.zeros(dim, 1))
         self.gamma = nn.Parameter(flow.ones(dim, 1))
     else:
         # Bug fix: the placeholders were registered as "weight"/"bias",
         # which does not match the "gamma"/"beta" names created above —
         # any code reading self.gamma/self.beta would raise
         # AttributeError when elementwise_affine is False.
         self.register_parameter("gamma", None)
         self.register_parameter("beta", None)
Beispiel #5
0
    def __init__(self, input_size, hidden_size):
        """Allocate parameters for a 3-gate recurrent cell.

        :param int input_size: input feature size
        :param int hidden_size: hidden state size
        """
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        # Three gates stacked along the last axis (presumably a GRU's
        # reset/update/new gates — confirm against forward()).
        gate_width = hidden_size * 3
        self.inp_W = nn.Parameter(flow.Tensor(input_size, gate_width))
        self.hid_W = nn.Parameter(flow.Tensor(hidden_size, gate_width))
        self.inp_b = nn.Parameter(flow.Tensor(gate_width))
        self.hid_b = nn.Parameter(flow.Tensor(gate_width))

        # Uniform init in [-sqrt(1/hidden_size), +sqrt(1/hidden_size)].
        bound = sqrt(1 / hidden_size)
        self.init_weight(-bound, bound)
def _test_convtranspose1d_bias_true(test_case, device):
    """Check ConvTranspose1d (bias=True) forward values and input gradient
    against precomputed references on the given device."""
    inp_np = np.array([[[0.54925832, -0.64144184, 0.15213189]]])
    w_np = np.ones((1, 2, 3))
    b_np = np.array([0.16849578, 0.1509564])
    expected_out = np.array(
        [
            [
                [0.71775407, 0.07631224, 0.22844413, -0.32081416, 0.32062766],
                [0.7002147, 0.05877288, 0.21090476, -0.3383535, 0.3030883],
            ]
        ]
    )
    # Grad of sum(out) w.r.t. input: each input element contributes to 6
    # outputs (2 channels x 3 kernel taps) with all-ones weights.
    expected_grad = np.array([[[6.0, 6.0, 6.0]]])

    x = flow.tensor(
        inp_np, dtype=flow.float32, device=flow.device(device), requires_grad=True
    )
    layer = nn.ConvTranspose1d(1, 2, 3, stride=1, bias=True)
    # Pin weights and bias to known values before moving to the device.
    layer.weight.data = flow.tensor(w_np, dtype=flow.float32)
    layer.bias = nn.Parameter(flow.Tensor(b_np))
    layer = layer.to(device)

    y = layer(x)
    test_case.assertTrue(np.allclose(y.numpy(), expected_out, 1e-06, 1e-06))
    y.sum().backward()
    test_case.assertTrue(
        np.allclose(x.grad.numpy(), expected_grad, 1e-06, 1e-06)
    )
Beispiel #7
0
 def __init__(self, num_patches, emb_dim, dropout_rate=0.1):
     """Learnable position embeddings for patch tokens plus a class token.

     :param int num_patches: number of image patches
     :param int emb_dim: embedding dimension
     :param float dropout_rate: dropout applied downstream; disabled if <= 0
     """
     super(PositionEmbs, self).__init__()
     # One extra slot for the class token; standard-normal initialization.
     init = np.random.randn(1, num_patches + 1, emb_dim)
     self.pos_embedding = nn.Parameter(flow.tensor(init, dtype=flow.float32))
     self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else None
Beispiel #8
0
    def __init__(self, hidden_size, vocab_size, hidden_act=nn.GELU()):
        """Language-model prediction head: transform then project to vocab.

        :param int hidden_size: encoder hidden dimension
        :param int vocab_size: vocabulary size
        :param hidden_act: activation module used inside the transform
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.transform = BertPredictionHeadTransform(hidden_size, hidden_act)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
        self.output_bias = nn.Parameter(flow.zeros(vocab_size))
        # Need a link between the two variables so that the bias is correctly
        # resized with `resize_token_embeddings`.
        self.decoder.bias = self.output_bias
Beispiel #9
0
    def __init__(
        self, model, input_size, output_size, num_experts, noisy_gating=True, k=4
    ):
        """Mixture-of-Experts layer with (optionally noisy) top-k gating.

        :param model: template expert module, cloned once per expert
        :param int input_size: gate input feature size
        :param int output_size: expert output size
        :param int num_experts: number of experts
        :param bool noisy_gating: add trainable noise to the gate logits
        :param int k: number of experts each example is routed to
        """
        import copy  # local import: only needed to clone the expert template

        super(MoE, self).__init__()
        self.noisy_gating = noisy_gating
        self.num_experts = num_experts
        self.output_size = output_size
        self.input_size = input_size
        self.k = k

        # Bug fix: the original appended the *same* module instance
        # num_experts times, so every "expert" shared one set of parameters.
        # Deep-copy the template so each expert trains independently.
        self.experts = nn.ModuleList(
            [copy.deepcopy(model) for _ in range(self.num_experts)]
        )

        # Gate and noise projections, zero-initialized.
        self.w_gate = nn.Parameter(
            flow.zeros(input_size, num_experts), requires_grad=True
        )
        self.w_noise = nn.Parameter(
            flow.zeros(input_size, num_experts), requires_grad=True
        )

        self.softplus = nn.Softplus()
        self.softmax = nn.Softmax(1)

        assert self.k <= self.num_experts
Beispiel #10
0
    def __init__(self, emb_dim, scale_learnable=False, dropout=0.0):
        """Initialize positional encoding.

        :param int emb_dim: embedding dimension
        :param bool scale_learnable: if True, add a learnable scalar alpha
        :param float dropout: dropout rate
        """
        super(PositionalEncoding, self).__init__()
        self.emb_dim = emb_dim
        # Fixed sqrt(dim) scale applied to embeddings downstream.
        self.xscale = math.sqrt(self.emb_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.scale_learnable = scale_learnable
        if self.scale_learnable:
            # Learnable scale factor starting at 1.0 (i.e. no rescaling).
            self.alpha = nn.Parameter(flow.tensor(1.0))
Beispiel #11
0
    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        """Window-based multi-head self-attention with relative position bias.

        :param int dim: number of input channels
        :param window_size: (Wh, Ww) window height and width
        :param int num_heads: number of attention heads
        :param bool qkv_bias: add a learnable bias to the qkv projection
        :param qk_scale: overrides the default head_dim**-0.5 scaling if set
        :param float attn_drop: dropout rate on attention weights
        :param float proj_drop: dropout rate after the output projection
        """
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # define a parameter table of relative position bias
        # Author zzk: we add trunc normal here!
        # One bias per head for each possible (dy, dx) offset within a window.
        self.relative_position_bias_table = nn.Parameter(
            flow.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
                       num_heads))  # 2*Wh-1 * 2*Ww-1, nH
        self.relative_position_bias_table.trunc_normal_(std=0.02)

        # get pair-wise relative position index for each token inside the window
        coords_h = flow.arange(self.window_size[0])
        coords_w = flow.arange(self.window_size[1])
        coords = flow.stack(flow.meshgrid(*[coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = flow.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = (coords_flatten[:, :, None] -
                           coords_flatten[:, None, :])  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0)  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :,
                        0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        # Scale row offsets so every (dy, dx) pair sums to a unique flat index
        # into the bias table.
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        # Buffer, not a parameter: a fixed lookup table, never trained.
        self.register_buffer("relative_position_index",
                             relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.softmax = nn.Softmax(dim=-1)
def _test_convtranspose1d_group_bias_true(test_case, device):
    """Check grouped ConvTranspose1d (groups=2, bias=True) forward values and
    input gradient against precomputed references on the given device."""
    inp_np = np.array(
        [
            [
                [-0.77808793, 0.99824008, 0.57340066],
                [1.46278707, -0.65234252, -1.13087643],
            ],
            [
                [0.76053973, 0.62332447, -1.17157106],
                [0.60291466, -0.0472167, 0.89986403],
            ],
        ]
    )
    w_np = np.ones((2, 1, 3))
    b_np = np.array([0.32546719, 0.14995032])
    expected_out = np.array(
        [
            [
                [-0.45262071, 0.54561937, 1.11902, 1.897108, 0.89886785],
                [1.6127374, 0.96039486, -0.1704815, -1.6332686, -0.9809261],
            ],
            [
                [1.0860069, 1.7093314, 0.5377604, -0.22277936, -0.8461038],
                [0.75286496, 0.70564824, 1.6055121, 1.0025976, 1.0498143],
            ],
        ]
    )
    # Grad of sum(out) w.r.t. input: each element feeds 3 kernel taps of its
    # own group only (groups=2), with all-ones weights.
    expected_grad = np.array(
        [[[3.0, 3.0, 3.0], [3.0, 3.0, 3.0]], [[3.0, 3.0, 3.0], [3.0, 3.0, 3.0]]]
    )

    x = flow.tensor(
        inp_np, dtype=flow.float32, device=flow.device(device), requires_grad=True
    )
    # groups=2: each input channel has its own single-channel filter.
    layer = nn.ConvTranspose1d(2, 2, 3, stride=1, groups=2, bias=True)
    layer.weight.data = flow.tensor(w_np, dtype=flow.float32)
    layer.bias = nn.Parameter(flow.Tensor(b_np))
    layer = layer.to(device)

    y = layer(x)
    test_case.assertTrue(np.allclose(y.numpy(), expected_out, 1e-06, 1e-06))
    y.sum().backward()
    test_case.assertTrue(
        np.allclose(x.grad.numpy(), expected_grad, 1e-06, 1e-06)
    )
Beispiel #13
0
    def __init__(
        self,
        image_size=(256, 256),
        patch_size=(16, 16),
        emb_dim=768,
        mlp_dim=3072,
        num_heads=12,
        num_layers=12,
        num_classes=1000,
        attn_dropout_rate=0.0,
        dropout_rate=0.1,
        feat_dim=None,
    ):
        """Vision Transformer: patch embedding, encoder stack, linear head.

        :param image_size: (H, W) of the input image
        :param patch_size: (h, w) of each patch
        :param int emb_dim: token embedding dimension
        :param int mlp_dim: encoder MLP hidden dimension
        :param int num_heads: attention heads per encoder layer
        :param int num_layers: number of encoder layers
        :param int num_classes: classifier output size
        :param float attn_dropout_rate: attention dropout rate
        :param float dropout_rate: general dropout rate
        :param feat_dim: NOTE(review): accepted but unused here — confirm
        """
        super(VisionTransformer, self).__init__()
        img_h, img_w = image_size
        patch_h, patch_w = patch_size
        # Number of non-overlapping patches covering the image.
        num_patches = (img_h // patch_h) * (img_w // patch_w)

        # Patch embedding: strided convolution over 3-channel input.
        self.embedding = nn.Conv2d(
            3, emb_dim, kernel_size=(patch_h, patch_w), stride=(patch_h, patch_w)
        )
        # Learnable class token, zero-initialized.
        self.cls_token = nn.Parameter(flow.zeros(1, 1, emb_dim))

        # Transformer encoder stack.
        self.transformer = Encoder(
            num_patches=num_patches,
            emb_dim=emb_dim,
            mlp_dim=mlp_dim,
            num_layers=num_layers,
            num_heads=num_heads,
            dropout_rate=dropout_rate,
            attn_dropout_rate=attn_dropout_rate,
        )

        # Classification head.
        self.classifier = nn.Linear(emb_dim, num_classes)
Beispiel #14
0
 def __init__(self, features, eps=1e-6):
     """Layer normalization with learnable scale (gamma) and shift (beta).

     :param int features: size of the normalized feature dimension
     :param float eps: small constant for numerical stability
     """
     super(LayerNorm, self).__init__()
     self.eps = eps
     # Identity transform at init: gamma = 1, beta = 0.
     self.gamma = nn.Parameter(flow.ones(features))
     self.beta = nn.Parameter(flow.zeros(features))
Beispiel #15
0
 def __init__(self, nf, nx):
     """1D "convolution" layer with an (nx, nf) weight — presumably the
     GPT-2 style Conv1D (a transposed linear layer); confirm in forward().

     :param int nf: number of output features
     :param int nx: number of input features
     """
     super(Conv1D, self).__init__()
     self.nf = nf
     # Weight drawn from N(0, 0.02); bias starts at zero.
     weight = flow.Tensor(nx, nf)
     nn.init.normal_(weight, mean=0, std=0.02)
     self.weight = nn.Parameter(weight)
     self.bias = nn.Parameter(flow.zeros(nf))
Beispiel #16
0
 def __init__(self, hidden_size, eps=1e-6):
     """Layer normalization over a dimension of size hidden_size.

     :param int hidden_size: size of the normalized dimension
     :param float eps: small constant for numerical stability
     """
     super(LayerNorm, self).__init__()
     # Identity transform at init: weight = 1, bias = 0.
     self.weight = nn.Parameter(flow.ones(hidden_size, dtype=flow.float32))
     self.bias = nn.Parameter(flow.zeros(hidden_size, dtype=flow.float32))
     self.eps = eps
Beispiel #17
0
 def _load_of_weight(param: flow.nn.Parameter, data: np.ndarray):
     """Copy a numpy array into an existing oneflow parameter in place.

     :param param: destination parameter, modified in place
     :param data: source array; must match the parameter's shape
     :raises AssertionError: if the shapes differ
     """
     assert param.shape == data.shape, (
         f"shape mismatch: param {param.shape} vs data {data.shape}"
     )
     # copy_ only needs a tensor source; the original wrapped it in an
     # nn.Parameter first, which was redundant.
     param.copy_(flow.tensor(data, dtype=flow.float32))
Beispiel #18
0
    def __init__(
        self,
        out_channels,
        kernel_size,
        sample_rate=16000,
        in_channels=1,
        stride=1,
        padding=0,
        dilation=1,
        bias=False,
        groups=1,
        min_low_hz=50,
        min_band_hz=50,
    ):
        """Sinc-based band-pass convolution front end (SincNet style).

        Only per-filter low cutoff and bandwidth are learned; the filter
        shapes themselves are derived from those two values at use time.

        :param int out_channels: number of band-pass filters
        :param int kernel_size: filter length (forced odd below)
        :param int sample_rate: audio sample rate in Hz
        :param int in_channels: must be 1 (single waveform channel)
        :param int stride: convolution stride
        :param int padding: convolution padding
        :param int dilation: convolution dilation
        :param bool bias: unsupported; must be False
        :param int groups: unsupported; must be 1
        :param int min_low_hz: floor on the learned low cutoff (Hz)
        :param int min_band_hz: floor on the learned bandwidth (Hz)
        :raises ValueError: if in_channels != 1, bias=True, or groups > 1
        """
        super(SincConv_fast, self).__init__()

        if in_channels != 1:
            msg = (
                "SincConv only support one input channel (here, in_channels = {%i})"
                % (in_channels)
            )
            raise ValueError(msg)

        self.out_channels = out_channels
        self.kernel_size = kernel_size

        # Forcing the filters to be odd (i.e, perfectly symmetrics)
        if kernel_size % 2 == 0:
            self.kernel_size = self.kernel_size + 1

        self.stride = stride
        self.padding = padding
        self.dilation = dilation

        if bias:
            raise ValueError("SincConv does not support bias.")
        if groups > 1:
            raise ValueError("SincConv does not support groups.")

        self.sample_rate = sample_rate
        self.min_low_hz = min_low_hz
        self.min_band_hz = min_band_hz

        # initialize filterbanks such that they are equally spaced in Mel scale
        low_hz = 30
        # Keep all bands below Nyquist even after the per-filter floors apply.
        high_hz = self.sample_rate / 2 - (self.min_low_hz + self.min_band_hz)

        mel = np.linspace(
            self.to_mel(low_hz), self.to_mel(high_hz), self.out_channels + 1
        )
        hz = self.to_hz(mel)

        # filter lower frequency (out_channels, 1)
        self.low_hz_ = nn.Parameter(flow.Tensor(hz[:-1]).reshape(-1, 1))

        # filter frequency band (out_channels, 1)
        # Bandwidths start as the gap between consecutive Mel-spaced cutoffs.
        self.band_hz_ = nn.Parameter(flow.Tensor(np.diff(hz)).reshape(-1, 1))

        # Hamming window
        # Only half the window is stored — filters are symmetric, so the
        # other half is presumably mirrored when the kernels are built;
        # confirm against the forward pass.
        n_lin = flow.Tensor(
            np.linspace(0, (self.kernel_size / 2) - 1, int((self.kernel_size / 2)))
        )
        self.window_ = 0.54 - 0.46 * flow.cos(2 * math.pi * n_lin / self.kernel_size)

        # (1, kernel_size/2)
        # Angular-frequency grid 2*pi*t/fs for the left half of the kernel.
        n = (self.kernel_size - 1) / 2.0
        self.n_ = (
            2
            * math.pi
            * flow.Tensor(np.arange(-n, 0).reshape(1, -1) / self.sample_rate)
        )
Beispiel #19
0
    def __init__(self, options):
        """Multi-layer perceptron configured from an options dict.

        Keys read: ``input_dim``, ``fc_lay`` (per-layer sizes), ``fc_drop``
        (per-layer dropout rates), ``fc_use_batchnorm``, ``fc_use_laynorm``,
        ``fc_use_laynorm_inp``, ``fc_use_batchnorm_inp``, ``fc_act``
        (per-layer values passed to ``act_fun``).
        """
        super(MLP, self).__init__()

        self.input_dim = int(options["input_dim"])
        self.fc_lay = options["fc_lay"]
        self.fc_drop = options["fc_drop"]
        self.fc_use_batchnorm = options["fc_use_batchnorm"]
        self.fc_use_laynorm = options["fc_use_laynorm"]
        self.fc_use_laynorm_inp = options["fc_use_laynorm_inp"]
        self.fc_use_batchnorm_inp = options["fc_use_batchnorm_inp"]
        self.fc_act = options["fc_act"]

        # Per-layer module containers, filled in the loop below.
        self.wx = nn.ModuleList([])  # linear layers
        self.bn = nn.ModuleList([])  # batch norms
        self.ln = nn.ModuleList([])  # layer norms
        self.act = nn.ModuleList([])  # activations
        self.drop = nn.ModuleList([])  # dropouts

        # input layer normalization
        if self.fc_use_laynorm_inp:
            self.ln0 = LayerNorm(self.input_dim)

        # input batch normalization
        if self.fc_use_batchnorm_inp:
            self.bn0 = nn.BatchNorm1d([self.input_dim], momentum=0.05)

        self.N_fc_lay = len(self.fc_lay)

        current_input = self.input_dim

        # Initialization of hidden layers
        for i in range(self.N_fc_lay):

            # dropout
            self.drop.append(nn.Dropout(p=self.fc_drop[i]))

            # activation
            self.act.append(act_fun(self.fc_act[i]))

            add_bias = True

            # layer norm initialization
            # Both norms are always constructed; which (if any) is applied is
            # presumably decided in forward() via the fc_use_* flags — confirm.
            self.ln.append(LayerNorm(self.fc_lay[i]))
            self.bn.append(nn.BatchNorm1d(self.fc_lay[i], momentum=0.05))

            # A normalization layer learns its own shift, making the linear
            # bias redundant for that layer.
            if self.fc_use_laynorm[i] or self.fc_use_batchnorm[i]:
                add_bias = False

            # Linear operations
            self.wx.append(nn.Linear(current_input, self.fc_lay[i], bias=add_bias))

            # weight initialization
            # Uniform init with bounds +-sqrt(0.01 / (fan_in + fan_out)).
            self.wx[i].weight = nn.Parameter(
                flow.Tensor(self.fc_lay[i], current_input).uniform_(
                    -np.sqrt(0.01 / (current_input + self.fc_lay[i])),
                    np.sqrt(0.01 / (current_input + self.fc_lay[i])),
                )
            )
            self.wx[i].bias = nn.Parameter(flow.zeros(self.fc_lay[i]))

            current_input = self.fc_lay[i]
Beispiel #20
0
    def __init__(
        self,
        img_size=224,
        patch_size=4,
        in_chans=3,
        num_classes=1000,
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.1,
        norm_layer=nn.LayerNorm,
        ape=False,
        patch_norm=True,
        use_checkpoint=False,
        **kwargs,
    ):
        """Swin Transformer backbone: patch embedding, stacked BasicLayers
        with halving resolution / doubling width, final norm, average pool,
        and a linear classification head.

        :param int img_size: input image size
        :param int patch_size: patch token size
        :param int in_chans: number of input image channels
        :param int num_classes: classifier size (0 selects an Identity head)
        :param int embed_dim: embedding dimension of the first stage
        :param depths: number of blocks per stage
        :param num_heads: attention heads per stage
        :param int window_size: attention window size
        :param float mlp_ratio: MLP hidden-to-embedding dim ratio
        :param bool qkv_bias: learnable bias on qkv projections
        :param qk_scale: overrides the default qk scale if set
        :param float drop_rate: dropout rate
        :param float attn_drop_rate: attention dropout rate
        :param float drop_path_rate: maximum stochastic-depth rate
        :param norm_layer: normalization layer class
        :param bool ape: add an absolute position embedding if True
        :param bool patch_norm: normalize after patch embedding if True
        :param bool use_checkpoint: activation checkpointing in BasicLayer
        :param kwargs: extra keyword arguments are accepted and ignored here
        """
        super().__init__()

        self.num_classes = num_classes
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        # Channel width of the final stage (doubles at each merge).
        self.num_features = int(embed_dim * 2**(self.num_layers - 1))
        self.mlp_ratio = mlp_ratio

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None,
        )
        num_patches = self.patch_embed.num_patches
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        # absolute position embedding
        if self.ape:
            self.absolute_pos_embed = nn.Parameter(
                flow.zeros(1, num_patches, embed_dim))
            # trunc_normal_(self.absolute_pos_embed, std=.02)
            self.absolute_pos_embed.trunc_normal_(std=0.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        # dpr = [x.item() for x in flow.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        # TODO: here we use numpy, may have little difference with torch.linspace
        # One linearly increasing drop-path rate per block across all stages.
        dpr = [x for x in np.linspace(0, drop_path_rate, sum(depths))
               ]  # stochastic depth decay rule

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2**i_layer),
                input_resolution=(
                    patches_resolution[0] // (2**i_layer),
                    patches_resolution[1] // (2**i_layer),
                ),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=self.mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                # Slice of dpr belonging to this stage's blocks.
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                # Every stage except the last downsamples via patch merging.
                downsample=PatchMerging if
                (i_layer < self.num_layers - 1) else None,
                use_checkpoint=use_checkpoint,
            )
            self.layers.append(layer)

        self.norm = norm_layer(self.num_features)
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.head = (nn.Linear(self.num_features, num_classes)
                     if num_classes > 0 else nn.Identity())

        self.apply(self._init_weights)
Beispiel #21
0
 def __init__(self):
     """Register two fixed 2-element parameters."""
     super().__init__()
     # requires_grad=False: registered as parameters but never updated by
     # the optimizer — they behave as constants.
     self.x = nn.Parameter(
         flow.tensor([2, 2], dtype=flow.float32), requires_grad=False)
     self.y = nn.Parameter(
         flow.tensor([3, 3], dtype=flow.float32), requires_grad=False)