Example #1
    def __init__(self,
                 input_size,
                 hidden_sizes,
                 output_size,
                 act_func='sigmoid',
                 train_alg='batch'):
        """
        Parameters:
        ------------------
        - input_size: integer, the number of features in the input
        - hidden_sizes: list of integers, the number of units in each hidden layer
        - output_size: integer, the length of the output vector
        - act_func: string, the name of the activation function used for each hidden layer
        - train_alg: string, allowed values are {'batch', 'reweight', 'naive'}
        """
        super(MLP, self).__init__()

        self.input_size = input_size
        layer_sizes = [input_size] + hidden_sizes
        self.linears = nn.ModuleList([
            Linear(in_size, out_size, bias=True)
            for in_size, out_size in zip(layer_sizes[:-1], layer_sizes[1:])
        ])

        self.output_layer = Linear(hidden_sizes[-1], output_size, bias=True)
        self.act = activation[act_func]
        self.train_alg = train_alg

        # list of layers in the network
        self.layers = [layer for layer in self.linears]
        self.layers.append(self.output_layer)
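The constructor above resolves self.act through an `activation` lookup table that is not part of this snippet. Below is a minimal sketch of what such a table could look like, plus a construction example in a comment; the dictionary contents and the sizes are illustrative assumptions, not taken from the original module.

import torch

# Assumed lookup table mapping the act_func strings accepted by MLP to
# element-wise activation functions (the original module defines its own).
activation = {
    'sigmoid': torch.sigmoid,
    'tanh': torch.tanh,
    'relu': torch.relu,
}

# Hypothetical construction, assuming the MLP class above is importable:
# mlp = MLP(input_size=784, hidden_sizes=[256, 128], output_size=10,
#           act_func='relu', train_alg='batch')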
Example #2
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 pe_grad=True):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

        self._pe_modules = [
            self.self_attn, self.linear1, self.linear2, self.norm1, self.norm2
        ]
Example #3
    def __init__(self, input_size, hidden_size, num_classes, train_alg='batch'):
        super(type(self), self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = num_classes
        self.train_alg = train_alg

        self.rnn = RNNCell(input_size, hidden_size)
        self.fc = Linear(self.hidden_size, self.output_size)
Example #4
class SimpleRNN(PeGradNet):
    def __init__(self, input_size, hidden_size, num_classes, train_alg='batch'):
        super(SimpleRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = num_classes
        self.train_alg = train_alg

        self.rnn = RNNCell(input_size, hidden_size)
        self.fc = Linear(self.hidden_size, self.output_size)


    def forward(self, x):
        x = x.squeeze(1).permute(1, 0, 2)  # seq_len x batch_size x input_size

        self.rnn.reset_pgrad()

        hx = torch.zeros(x.shape[1], self.hidden_size, device=x.device)
        
        for t in range(x.shape[0]):
            hx = self.rnn(x[t], hx)

        logits = self.fc(hx)

        return logits

    def per_example_gradient(self, loss):
        grads = []

        # copy so the RNN cell's own pre-activation list is not mutated
        pre_acts = list(self.rnn.pre_activation)
        pre_acts.append(self.fc.pre_activation)
        
        Z_grad = torch.autograd.grad(loss, pre_acts, retain_graph=True)

        grads.extend(self.rnn.per_example_gradient(Z_grad[:-1]))
        grads.extend(self.fc.per_example_gradient(Z_grad[-1]))

        return grads

    def pe_grad_norm(self, loss, batch_size, device):
        grad_norm = torch.zeros(batch_size, device=device, requires_grad=False)
        
        # copy so the RNN cell's own pre-activation list is not mutated
        pre_acts = list(self.rnn.pre_activation)
        pre_acts.append(self.fc.pre_activation)

        Z_grad = torch.autograd.grad(loss, pre_acts, retain_graph=True)
        grad_norm.add_(self.rnn.pe_grad_sqnorm(Z_grad[:-1]))
        grad_norm.add_(self.fc.pe_grad_sqnorm(Z_grad[-1]))

        grad_norm.sqrt_()

        return grad_norm
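For orientation, a hedged usage sketch of the per-example gradient-norm path above. It assumes SimpleRNN and its custom RNNCell/Linear layers are importable from this repository, and that the loss is reduced by summation so the derivative of each pre-activation keeps its per-example structure; all shapes are illustrative only.

import torch
import torch.nn.functional as F

# Hypothetical usage: the input layout matches forward(), i.e.
# [batch, 1, seq_len, input_size] before the squeeze/permute.
model = SimpleRNN(input_size=28, hidden_size=64, num_classes=10)
x = torch.randn(32, 1, 28, 28)                      # 32 examples, 28 steps of 28 features
y = torch.randint(0, 10, (32,))
logits = model(x)                                   # [32, 10]
loss = F.cross_entropy(logits, y, reduction='sum')  # summed over the batch
norms = model.pe_grad_norm(loss, batch_size=32, device=x.device)  # per-example norms, shape [32]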
Example #5
    def __init__(self,
                 input_size,
                 channel_sizes,
                 kernel_sizes,
                 fc_sizes,
                 num_classes,
                 train_alg='batch'):
        super(type(self), self).__init__()

        self.input_size = input_size
        self.kernel_sizes = kernel_sizes
        self.act = F.relu
        self.pooling = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        # convolutional layers
        layers = []
        out_size = input_size
        for c_in, c_out, k in zip(channel_sizes[:-1], channel_sizes[1:],
                                  kernel_sizes):
            layer = Conv2d(c_in, c_out, k)
            layers.append(layer)
            out_size = conv_outsize(out_size, k, layer.padding[0],
                                    layer.stride[0])
            out_size = conv_outsize(out_size, self.pooling.kernel_size,
                                    self.pooling.padding, self.pooling.stride)

        self.convs = nn.ModuleList(layers)
        self.conv_outsize = out_size * out_size * c_out

        # fully-connected layers
        fc_sizes = [self.conv_outsize] + fc_sizes
        self.linears = nn.ModuleList([
            Linear(in_size, out_size)
            for in_size, out_size in zip(fc_sizes[:-1], fc_sizes[1:])
        ])
        self.output_layer = Linear(fc_sizes[-1], num_classes)

        self.layers = [layer for layer in self.convs]
        self.layers += [layer for layer in self.linears]
        self.layers.append(self.output_layer)
        self.train_alg = train_alg
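The constructor relies on a conv_outsize helper that is not shown. A minimal sketch of what such a helper typically computes is given below; the name and argument order follow the calls above, while the body is the standard output-size formula for a convolution or pooling layer with dilation 1 and is an assumption about the original implementation.

def conv_outsize(n, kernel_size, padding, stride):
    # floor((n + 2*padding - kernel_size) / stride) + 1: the spatial output
    # size of a convolution or pooling layer with dilation 1.
    return (n + 2 * padding - kernel_size) // stride + 1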
Example #6
    def __init__(self, n_token, n_classes, d_model=512, n_layers=2,
                 n_head=8, n_hidden=2048, dropout=0.1, max_seq_len=512,
                 embeddings=None, train_alg='batch'):
        super(TransformerModel, self).__init__()

        self.train_alg = train_alg
        self.d_model = d_model
        self.n_head = n_head

        if embeddings is None:            
            self.token_embedding = nn.Embedding(n_token, d_model)
        else:
            self.token_embedding = nn.Embedding.from_pretrained(embeddings)
            self.token_embedding.weight.requires_grad = False

        self.pos_encoder = PositionalEncoding(d_model, dropout, max_seq_len)        
        encoder_layers = TransformerEncoderLayer(d_model, n_head, n_hidden, dropout)
        # encoder_norm = nn.LayerNorm(d_model)
        encoder_norm = None
        self.encoder = TransformerEncoder(encoder_layers, n_layers, encoder_norm)
        self.fc = Linear(d_model, n_classes)
Example #7
    def __init__(self, output_size, cfg='A', train_alg='batch',
                 batch_norm=False, pre_trained=False, init_weights=True):
        super(VGG, self).__init__()
        self.layers = []
        self.features = make_layers(cfgs[cfg], self.layers, batch_norm)
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = Sequential(
            Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            Linear(4096, output_size)
        )
        self.layers += [self.classifier[0], self.classifier[3], self.classifier[6]]

        if init_weights:
            self._initialize_weights()

        self.train_alg = train_alg        
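The feature extractor is built from a cfgs table and a make_layers helper that are not shown. For orientation only, the entry below mirrors the 'A' (VGG-11) configuration used by torchvision, which this snippet appears to follow; treat it as an assumption about the original table.

# Assumed configuration: integers are output channels of 3x3 convolutions,
# 'M' marks a 2x2 max-pooling stage (as in torchvision's VGG-11 / cfg 'A').
cfgs = {
    'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
}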
Example #8
    def __init__(self, input_size, hidden_size, num_classes, num_layers=1,
                 train_alg='batch', bias=True):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        # self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers,
        #                   nonlinearity='tanh')
        self.rnn = RNNModule(input_size, hidden_size)
        self.output_layer = Linear(hidden_size, num_classes)
        self.train_alg = train_alg

        self.layers = [self.rnn, self.output_layer]
Example #9
    def __init__(self, block, layers, num_classes=10, zero_init_residual=False,
                 norm_layer=None, train_alg='batch'):
        super(ResNet, self).__init__()

        self.train_alg = train_alg
        self.inplanes = 64
        self.dilation = 1
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.conv1 = Conv2d(3, self.inplanes, kernel_size=6, stride=2, padding=3,
                            bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

        # collecting layers whose per-example gradients need to be computed
        self.layers = [self.conv1]
        add_pegrad_layers(self.layer1, self.layers)
        add_pegrad_layers(self.layer2, self.layers)
        add_pegrad_layers(self.layer3, self.layers)
        add_pegrad_layers(self.layer4, self.layers)
        self.layers.append(self.fc)
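A hedged construction sketch for the snippet above, assuming BasicBlock (with expansion == 1) and the add_pegrad_layers helper are defined elsewhere in this repository; the depths follow the usual ResNet-18 layout.

# Hypothetical: a ResNet-18-style network for 10-class images.
# `layers` gives the number of residual blocks in layer1..layer4.
model = ResNet(BasicBlock, layers=[2, 2, 2, 2], num_classes=10)
pegrad_layers = model.layers  # conv1, the per-block conv layers, and fc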
Example #10
    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 bias=True,
                 add_bias_kv=False,
                 add_zero_attn=False):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        self.in_proj = Linear(embed_dim, 3 * embed_dim)
        # self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))

        # if bias:
        #     self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
        # else:
        #     self.register_parameter('in_proj_bias', None)
        self.out_proj = Linear(embed_dim, embed_dim)

        self._reset_parameters()
Example #11
class SimpleLSTM(PeGradNet):
    def __init__(self, input_size, hidden_size, output_size, train_alg='batch'):
        super(SimpleLSTM, self).__init__()
        
        self.lstm = LSTMCell(input_size, hidden_size)
        self.fc = Linear(hidden_size, output_size)
        self.train_alg = train_alg

    def forward(self, x, init_states=None):
        # x = x.squeeze(1)
        batch_size = x.shape[0]
        x = x.reshape(batch_size, x.shape[2], -1)
        seq_size = x.shape[1]
        hidden_size = self.lstm.hidden_size

        self.lstm.reset_pgrad()

        if init_states is None:
            h_t, c_t = (torch.zeros(batch_size, hidden_size, device=x.device), 
                        torch.zeros(batch_size, hidden_size, device=x.device))
        else:
            h_t, c_t = init_states
         
        for t in range(seq_size):
            x_t = x[:, t, :]
            h_t, c_t = self.lstm(x_t, h_t, c_t)

        logits = self.fc(h_t)

        return logits

    def pe_grad_norm(self, loss, batch_size, device):
        grad_norm = torch.zeros(batch_size, device=device)

        # copy so the LSTM cell's own pre-activation list is not mutated
        pre_acts = list(self.lstm.pre_activation)
        pre_acts.append(self.fc.pre_activation)
        Z_grad = torch.autograd.grad(loss, pre_acts, retain_graph=True)

        grad_norm.add_(self.lstm.pe_grad_sqnorm(Z_grad[:-1]))
        grad_norm.add_(self.fc.pe_grad_sqnorm(Z_grad[-1]))
            
        grad_norm.sqrt_()

        return grad_norm
Example #12
class TransformerModel(nn.Module):
    def __init__(self, n_token, n_classes, d_model=512, n_layers=2,
                 n_head=8, n_hidden=2048, dropout=0.1, max_seq_len=512,
                 embeddings=None, train_alg='batch'):
        super(TransformerModel, self).__init__()

        self.train_alg = train_alg
        self.d_model = d_model
        self.n_head = n_head

        if embeddings is None:            
            self.token_embedding = nn.Embedding(n_token, d_model)
        else:
            self.token_embedding = nn.Embedding.from_pretrained(embeddings)
            self.token_embedding.weight.requires_grad = False

        self.pos_encoder = PositionalEncoding(d_model, dropout, max_seq_len)        
        encoder_layers = TransformerEncoderLayer(d_model, n_head, n_hidden, dropout)
        # encoder_norm = nn.LayerNorm(d_model)
        encoder_norm = None
        self.encoder = TransformerEncoder(encoder_layers, n_layers, encoder_norm)
        self.fc = Linear(d_model, n_classes)

    def init_weights(self):
        initrange = 0.1
        self.token_embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        self.fc.weight.data.uniform_(-initrange, initrange)

    def forward(self, x):
        # positions = torch.arange(len(x), device=x.device).unsqueeze(-1)

        x = x.transpose(0, 1)
        # [sentence length, batch_size]
        x = self.token_embedding(x)
        # [sentence length, batch_size, embedding dim]
        x = self.pos_encoder(x)
        # x = x + self.pos_encoder(positions).expand_as(x)
        
        # [sentence length, batch_size, embedding dim]
        output = self.encoder(x)
        # [sentence length, batch_size, embedding dim]
        avg_out = output.transpose(0, 1).mean(dim=1)
        # [batch_size, embedding dim]
        preact = self.fc(avg_out)
        
        # [batch_size, num_classes]
        # return F.log_softmax(output, dim=-1)
        return preact

    def per_example_gradient(self, loss):
        grads = []
        pre_acts = []

        pre_acts.extend(self.encoder.collect_preactivations())
        pre_acts.append(self.fc.pre_activation)

        Z_grad = torch.autograd.grad(loss, pre_acts, retain_graph=True)

        # TransformerEncoder
        grads.extend(self.encoder.per_example_gradient(Z_grad[:-1]))

        # fully connected layer
        grads.extend(self.fc.per_example_gradient(Z_grad[-1]))

        return grads

    def pe_grad_norm(self, loss, batch_size, device):
        grad_norm = torch.zeros(batch_size, device=device)

        pre_acts = []
        pre_acts.extend(self.encoder.collect_preactivations())
        pre_acts.append(self.fc.pre_activation)

        Z_grad = torch.autograd.grad(loss, pre_acts, retain_graph=True)

        grad_norm.add_(self.encoder.pe_grad_sqnorm(Z_grad[:-1]))
        grad_norm.add_(self.fc.pe_grad_sqnorm(Z_grad[-1]))        
        grad_norm.sqrt_()

        return grad_norm
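A hedged usage sketch clarifying the input layout expected by forward(): token ids with the batch dimension first, which the model transposes internally to [sentence length, batch_size]. It assumes TransformerModel and its custom layers are importable from this repository; all sizes are arbitrary examples.

import torch

model = TransformerModel(n_token=10000, n_classes=2, d_model=128,
                         n_layers=2, n_head=4, n_hidden=256)
tokens = torch.randint(0, 10000, (8, 50))   # [batch_size, sentence length]
logits = model(tokens)                      # [batch_size, n_classes] pre-softmax scores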
Example #13
    def __init__(self, input_size, hidden_size, output_size, train_alg='batch'):
        super(SimpleLSTM, self).__init__()

        self.lstm = LSTMCell(input_size, hidden_size)
        self.fc = Linear(hidden_size, output_size)
        self.train_alg = train_alg
Example #14
class TransformerEncoderLayer(nn.Module):
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 pe_grad=True):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

        self._pe_modules = [
            self.self_attn, self.linear1, self.linear2, self.norm1, self.norm2
        ]

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        src2 = self.self_attn(src,
                              src,
                              src,
                              attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)
        # residual connection around self-attention
        src = src + self.dropout1(src2)
        src = self.norm1(src)

        if hasattr(self, "activation"):
            src2 = self.linear2(self.dropout(self.activation(
                self.linear1(src))))
        else:  # for backward compatibility
            src2 = self.linear2(self.dropout(F.relu(self.linear1(src))))
        # residual connection around the feedforward block
        out = src + self.dropout2(src2)
        out = self.norm2(out)

        return out

    def per_example_gradient(self):
        grads = []

        for m in self._pe_modules:
            grads.extend(m.per_example_gradient())

        return grads

    def pe_grad_sqnorm(self, deriv_pre_activ):
        batch_size = deriv_pre_activ[0].size(1)
        device = deriv_pre_activ[0].device
        grad_sq_norm = torch.zeros(batch_size, device=device)

        grad_sq_norm.add_(self.self_attn.pe_grad_sqnorm(deriv_pre_activ[:2]))
        grad_sq_norm.add_(self.linear1.pe_grad_sqnorm(deriv_pre_activ[2]))
        grad_sq_norm.add_(self.linear2.pe_grad_sqnorm(deriv_pre_activ[3]))
        grad_sq_norm.add_(self.norm1.pe_grad_sqnorm(deriv_pre_activ[4]))
        grad_sq_norm.add_(self.norm2.pe_grad_sqnorm(deriv_pre_activ[5]))

        return grad_sq_norm

    def collect_preactivations(self):
        pre_acts = []

        pre_acts.extend(self.self_attn.collect_preactivations())
        pre_acts.append(self.linear1.pre_activation)
        pre_acts.append(self.linear2.pre_activation)
        pre_acts.append(self.norm1.pre_activation)
        pre_acts.append(self.norm2.pre_activation)

        return pre_acts
Example #15
class MultiheadAttention(nn.Module):
    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 bias=True,
                 add_bias_kv=False,
                 add_zero_attn=False):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        self.in_proj = Linear(embed_dim, 3 * embed_dim)
        # self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))

        # if bias:
        #     self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
        # else:
        #     self.register_parameter('in_proj_bias', None)
        self.out_proj = Linear(embed_dim, embed_dim)

        self._reset_parameters()

    def _reset_parameters(self):
        xavier_uniform_(self.in_proj.weight)

        if self.in_proj.bias is not None:
            constant_(self.in_proj.bias, 0.)
            constant_(self.out_proj.bias, 0.)

    def forward(self,
                query,
                key,
                value,
                key_padding_mask=None,
                need_weights=True,
                attn_mask=None):
        attn_out, _ = multi_head_attention_forward(
            query,
            key,
            value,
            self.embed_dim,
            self.num_heads,
            self.in_proj,
            self.dropout,
            self.out_proj,
            training=self.training,
            key_padding_mask=key_padding_mask,
            need_weights=need_weights,
            attn_mask=attn_mask)

        return attn_out

    def per_example_gradient(self, deriv_pre_activ_in, deriv_pre_activ_out):
        pe_grad_weight_in, pe_grad_bias_in = \
            self.in_proj.per_example_gradient(deriv_pre_activ_in)
        pe_grad_weight_out, pe_grad_bias_out = \
            self.out_proj.per_example_gradient(deriv_pre_activ_out)

        return (pe_grad_weight_in, pe_grad_bias_in, pe_grad_weight_out,
                pe_grad_bias_out)

    def pe_grad_sqnorm(self, deriv_pre_activ):
        grads = self.per_example_gradient(*deriv_pre_activ)
        batch_size = grads[0].size(0)

        grad_sq_norm = torch.zeros(batch_size, device=grads[0].device)
        for grad in grads:
            grad_sq_norm.add_(grad.pow(2).view(batch_size, -1).sum(1))

        return grad_sq_norm

    def collect_preactivations(self):
        return (self.in_proj.pre_activation, self.out_proj.pre_activation)
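Finally, a hedged shape check for the attention module, assuming the custom Linear wrapper and the multi_head_attention_forward helper used above are available; inputs follow the [seq_len, batch_size, embed_dim] convention of the encoder layer.

import torch

# Hypothetical usage: self-attention over a sequence of length 20,
# batch size 8, embedding dimension 128 split across 4 heads.
mha = MultiheadAttention(embed_dim=128, num_heads=4)
x = torch.randn(20, 8, 128)   # [seq_len, batch_size, embed_dim]
out = mha(x, x, x)            # same shape as the input: [20, 8, 128]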