def test_dropout():
    m = nn.Module()
    x = torch.ones(2, 6, 6, 6)
    # With identical seeds, W.dropout should match nn.Dropout elementwise...
    torch.manual_seed(100)
    y0 = nn.Dropout(0.3)(x)
    torch.manual_seed(100)
    y1 = W.dropout(x, 0.3, parent=m)
    assert torch.equal(y0, y1)
    # ... and by_channel=True should match nn.Dropout2d (whole-channel dropout).
    torch.manual_seed(100)
    y0 = nn.Dropout2d(0.3)(x)
    torch.manual_seed(100)
    y1 = W.dropout(x, 0.3, by_channel=True, parent=m)
    assert torch.equal(y0, y1)
def multi_head_attention(x, y=None, num_head=8, dropout=0.1, mask=None, **kw):

    def split_heads(t):
        # (B, C, L) -> (B, N, H, L) where N*H == C
        return t.reshape(batch, num_head, size // num_head, t.shape[-1])

    def merge_heads(t):
        # (B, N, H, L) -> (B, C, L)
        return t.reshape(batch, -1, t.shape[-1])

    if y is None:
        y = x  # self attention
    batch, size = x.shape[:2]  # B, C
    assert size % num_head == 0, 'num_head must be a divisor of size.'
    assert y.shape[:2] == x.shape[:2], 'The first 2 dims of x, y must match.'
    q = W.linear(x, size)  # query
    k = W.linear(y, size)  # key
    v = W.linear(y, size)  # value
    q = split_heads(q)  # (B, N, H, Lx)
    k = split_heads(k)  # (B, N, H, Ly)
    v = split_heads(v)  # (B, N, H, Ly)
    q *= (size // num_head) ** (-0.5)
    a = q.transpose(2, 3).contiguous().matmul(k)  # attention weights, (B, N, Lx, Ly)
    if mask is not None:
        a += mask
    a = F.softmax(a, dim=-1)
    a = W.dropout(a, dropout)
    x = v.matmul(a.transpose(2, 3).contiguous())  # (B, N, H, Lx)
    x = merge_heads(x)  # (B, C, Lx)
    return W.linear(x, size)
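# A minimal plain-PyTorch sketch of the same scaled dot-product attention
# arithmetic, included only to illustrate the tensor shapes. It does not use
# the W functional API; the Conv1d projections are untrained stand-ins (an
# assumption) for the per-call linear layers created above, and the function
# name is hypothetical.
def _attention_shape_sketch():
    import torch
    import torch.nn.functional as F
    batch, size, num_head, lx, ly = 2, 16, 4, 10, 12
    head = size // num_head
    x = torch.randn(batch, size, lx)  # query source, (B, C, Lx)
    y = torch.randn(batch, size, ly)  # key/value source, (B, C, Ly)
    wq, wk, wv = (torch.nn.Conv1d(size, size, 1) for _ in range(3))
    q = wq(x).reshape(batch, num_head, head, lx) * head ** (-0.5)
    k = wk(y).reshape(batch, num_head, head, ly)
    v = wv(y).reshape(batch, num_head, head, ly)
    a = F.softmax(q.transpose(2, 3) @ k, dim=-1)  # (B, N, Lx, Ly)
    out = (v @ a.transpose(2, 3)).reshape(batch, size, lx)  # back to (B, C, Lx)
    assert out.shape == (batch, size, lx)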
def forward(self, x):
    # EfficientNet-B0 style network: stem conv, a stack of MBConv blocks
    # driven by spec_b0, then the 1280-channel tail, pooling and classifier.
    x = conv_bn_act(x, 32, kernel=3, stride=2, name='head')
    for size, expand, kernel, stride, repeat, se_ratio, dc_ratio in spec_b0:
        for i in range(repeat):
            stride = stride if i == 0 else 1  # only the first block of a stage downsamples
            x = mb_block(x, size, expand, kernel, stride, se_ratio, dc_ratio)
    x = conv_bn_act(x, 1280, name='tail')
    x = F.adaptive_avg_pool2d(x, 1)
    x = W.dropout(x, 0.2)
    x = x.view(x.shape[0], -1)
    x = W.linear(x, 1000)
    return x
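# Each entry of spec_b0 is unpacked above as
# (size, expand, kernel, stride, repeat, se_ratio, dc_ratio). The list below is
# an assumed illustration based on the commonly published EfficientNet-B0 stage
# settings; it is not necessarily the spec_b0 defined in this project.
_spec_b0_example = [
    # size, expand, kernel, stride, repeat, se_ratio, dc_ratio
    (16, 1, 3, 1, 1, 0.25, 0.2),
    (24, 6, 3, 2, 2, 0.25, 0.2),
    (40, 6, 5, 2, 2, 0.25, 0.2),
    (80, 6, 3, 2, 3, 0.25, 0.2),
    (112, 6, 5, 1, 3, 0.25, 0.2),
    (192, 6, 5, 2, 4, 0.25, 0.2),
    (320, 6, 3, 1, 1, 0.25, 0.2),
]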
def classify(x, size, *arg, **kw):
    x = W.dropout(x, rate=0.2, name='classifier-0')
    return W.linear(x, size, name='classifier-1')
def residual_add(x, layer, dropout=0.1, **kw):
    # Pre-norm residual wrapper: LayerNorm -> layer -> dropout -> skip connection.
    y = W.layer_norm(x)
    y = layer(y, **kw)
    y = W.dropout(y, dropout)
    return x + y
def feed_forward(x, size_ff=2048, dropout=0.1, **kw):
    y = W.linear(x, size_ff, activation='relu')
    y = W.dropout(y, dropout)
    return W.linear(y, x.shape[1])
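# A hedged sketch of how the helpers above might compose into one transformer
# encoder block: pre-norm self-attention followed by the position-wise feed
# forward, each wrapped in residual_add. encoder_block is a hypothetical name,
# not necessarily how this project assembles its model.
def encoder_block(x, dropout=0.1, **kw):
    x = residual_add(x, multi_head_attention, dropout=dropout, **kw)
    return residual_add(x, feed_forward, dropout=dropout, **kw)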