Example #1
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
                config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" %
                (config.hidden_size, config.num_attention_heads))
        self.output_attentions = config.output_attentions

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size /
                                       config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = quantized_Linear(config.hidden_size,
                                      self.all_head_size,
                                      bitW=config.bitW)
        self.key = quantized_Linear(config.hidden_size,
                                    self.all_head_size,
                                    bitW=config.bitW)
        self.value = quantized_Linear(config.hidden_size,
                                      self.all_head_size,
                                      bitW=config.bitW)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
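
Every example on this page relies on a quantized_Linear layer (and, in Example #3, a quantized_CNN layer) that is not shown here. The following is a minimal sketch of what such a layer could look like, assuming DoReFa-style weight quantization with a straight-through estimator; the actual implementation in the source project may differ.

import torch
import torch.nn as nn
import torch.nn.functional as F

class weight_quantize_fn(torch.autograd.Function):
    # Straight-through estimator: quantize weights in the forward pass,
    # pass gradients through unchanged in the backward pass.
    @staticmethod
    def forward(ctx, w, bitW):
        if bitW == 32:
            return w
        if bitW == 1:
            # Binary weights scaled by their mean absolute value
            return torch.sign(w) * w.abs().mean()
        # DoReFa-style: squash to [0, 1], round to 2**bitW levels, map back to [-1, 1]
        n = float(2 ** bitW - 1)
        w = torch.tanh(w)
        w = w / (2 * w.abs().max()) + 0.5
        return 2 * torch.round(w * n) / n - 1

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output, None

class quantized_Linear(nn.Linear):
    # nn.Linear whose weights are quantized to bitW bits on the fly.
    def __init__(self, in_features, out_features, bias=True, bitW=8):
        super().__init__(in_features, out_features, bias=bias)
        self.bitW = bitW

    def forward(self, x):
        w_q = weight_quantize_fn.apply(self.weight, self.bitW)
        return F.linear(x, w_q, self.bias)
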
Example #2
    def __init__(self, config):
        super().__init__()

        self.config = config
        self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size,
                                                  eps=config.layer_norm_eps)
        self.attention = AlbertAttention(config)
        self.ffn = quantized_Linear(config.hidden_size,
                                    config.intermediate_size,
                                    bitW=config.bitW)
        self.ffn_output = quantized_Linear(config.intermediate_size,
                                           config.hidden_size,
                                           bitW=config.bitW)
        self.activation = ACT2FN[config.hidden_act]
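
ACT2FN here is the usual name-to-activation mapping used in transformer codebases; roughly, it behaves like the dictionary sketched below (an illustrative assumption, the real mapping contains more entries):

import torch
import torch.nn.functional as F

# Illustrative stand-in for ACT2FN: maps activation names from the config
# to the corresponding activation functions.
ACT2FN = {"gelu": F.gelu, "relu": F.relu, "tanh": torch.tanh}
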
Example #3
    def __init__(self, block, layers, first_stride=1, num_classes=10, bitW=1):
        super(ResNet_Cifar, self).__init__()
        self.bitW = bitW
        self.inplanes = 16
        # self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.conv1 = quantized_CNN(3,
                                   16,
                                   kernel_size=3,
                                   stride=first_stride,
                                   padding=1,
                                   bias=False,
                                   bitW=self.bitW)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(block, 16, layers[0])
        self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
        self.avgpool = nn.AvgPool2d(8, stride=1)
        # self.fc = nn.Linear(64 * block.expansion, num_classes)
        self.fc = quantized_Linear(64 * block.expansion,
                                   num_classes,
                                   bitW=self.bitW)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
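
quantized_CNN follows the same pattern as quantized_Linear but wraps a convolution. Below is a hypothetical instantiation of the constructor above, assuming a standard CIFAR BasicBlock with expansion = 1 (roughly a ResNet-20 configuration); the block type and layer counts are illustrative, not taken from the source:

import torch

# Hypothetical usage; BasicBlock and the layer counts are assumptions.
model = ResNet_Cifar(BasicBlock, layers=[3, 3, 3], num_classes=10, bitW=1)
x = torch.randn(4, 3, 32, 32)   # a batch of four CIFAR-sized images
logits = model(x)               # expected shape: [4, 10] (forward not shown above)
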
Example #4
    def __init__(self, config):
        super().__init__()
        self.dense = quantized_Linear(config.intermediate_size,
                                      config.hidden_size,
                                      bitW=config.bitW)
        self.LayerNorm = BertLayerNorm(config.hidden_size,
                                       eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
Example #5
    def __init__(self, config):
        super().__init__()
        self.dense = quantized_Linear(config.hidden_size,
                                      config.intermediate_size,
                                      bitW=config.bitW)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act
Example #6
    def __init__(self, config):
        super().__init__(config)

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertTransformer(config)
        self.pooler = quantized_Linear(config.hidden_size,
                                       config.hidden_size,
                                       bitW=config.bitW)
        self.pooler_activation = nn.Tanh()

        self.init_weights()
Example #7
    def __init__(self, config):
        super().__init__(config)

        self.output_attentions = config.output_attentions
        self.num_attention_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.dense = quantized_Linear(config.hidden_size,
                                      config.hidden_size,
                                      bitW=config.bitW)
        self.LayerNorm = nn.LayerNorm(config.hidden_size,
                                      eps=config.layer_norm_eps)
        self.pruned_heads = set()
Example #8
    def __init__(self, config):
        super(QuantBertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.output_attentions = config.output_attentions # False

        self.num_attention_heads = config.num_attention_heads # 12
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads) # 768 / 12 = 64
        self.all_head_size = self.num_attention_heads * self.attention_head_size # 12 * 64 = 768

        # print(config.bitW)
        # input()
        self.query = quantized_Linear(config.hidden_size, self.all_head_size, bitW=config.bitW)
        self.key = quantized_Linear(config.hidden_size, self.all_head_size, bitW=config.bitW)
        self.value = quantized_Linear(config.hidden_size, self.all_head_size, bitW=config.bitW)
        # Each projection actually contains num_attention_heads per-head projectors:
        #  [hidden_size (768), attention_head_size (64)] * [num_attention_heads (12)]
        # self.query = nn.Linear(config.hidden_size, self.all_head_size)  # [768, 768]
        # self.key = nn.Linear(config.hidden_size, self.all_head_size)
        # self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
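
For reference, the query/key/value projections above are usually split into attention heads in the forward pass by a helper along these lines (standard BERT-style code, shown as a sketch; the source may differ in detail):

    def transpose_for_scores(self, x):
        # [batch, seq_len, all_head_size] -> [batch, num_heads, seq_len, head_size]
        new_shape = x.size()[:-1] + (self.num_attention_heads,
                                     self.attention_head_size)
        x = x.view(*new_shape)
        return x.permute(0, 2, 1, 3)
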
Example #9
    def __init__(self, config):
        super().__init__()
        self.dense = quantized_Linear(config.hidden_size,
                                      config.hidden_size,
                                      bitW=config.bitW)
        self.activation = nn.Tanh()
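
This last module is a BERT-style pooler. Its forward pass is not part of the snippet, but it typically takes the hidden state of the first ([CLS]) token, as sketched below:

    def forward(self, hidden_states):
        # Pool by taking the hidden state of the first ([CLS]) token,
        # then applying the quantized projection followed by tanh.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        return self.activation(pooled_output)
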