def __init__(self, config):
    super().__init__()
    if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
            config, "embedding_size"):
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (config.hidden_size, config.num_attention_heads))

    self.output_attentions = config.output_attentions
    self.num_attention_heads = config.num_attention_heads
    self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
    self.all_head_size = self.num_attention_heads * self.attention_head_size

    self.query = quantized_Linear(config.hidden_size, self.all_head_size, bitW=config.bitW)
    self.key = quantized_Linear(config.hidden_size, self.all_head_size, bitW=config.bitW)
    self.value = quantized_Linear(config.hidden_size, self.all_head_size, bitW=config.bitW)

    self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def __init__(self, config):
    super().__init__()
    self.config = config
    self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.attention = AlbertAttention(config)
    self.ffn = quantized_Linear(config.hidden_size, config.intermediate_size, bitW=config.bitW)
    self.ffn_output = quantized_Linear(config.intermediate_size, config.hidden_size, bitW=config.bitW)
    self.activation = ACT2FN[config.hidden_act]
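# Sketch (not from this repo): roughly how the modules built above are wired in the
# forward pass of a Hugging Face-style AlbertLayer. The quantized variant is assumed
# to keep the same dataflow, only swapping nn.Linear for quantized_Linear.
def forward(self, hidden_states, attention_mask=None, head_mask=None):
    attention_output = self.attention(hidden_states, attention_mask, head_mask)
    ffn_output = self.ffn(attention_output[0])        # hidden_size -> intermediate_size
    ffn_output = self.activation(ffn_output)
    ffn_output = self.ffn_output(ffn_output)          # intermediate_size -> hidden_size
    # Residual connection around the whole feed-forward block, then LayerNorm.
    hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])
    return (hidden_states,) + attention_output[1:]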
def __init__(self, block, layers, first_stride=1, num_classes=10, bitW=1):
    super(ResNet_Cifar, self).__init__()
    self.bitW = bitW
    self.inplanes = 16
    # self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
    self.conv1 = quantized_CNN(3, 16, kernel_size=3, stride=first_stride, padding=1,
                               bias=False, bitW=self.bitW)
    self.bn1 = nn.BatchNorm2d(16)
    self.relu = nn.ReLU(inplace=True)
    self.layer1 = self._make_layer(block, 16, layers[0])
    self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
    self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
    self.avgpool = nn.AvgPool2d(8, stride=1)
    # self.fc = nn.Linear(64 * block.expansion, num_classes)
    self.fc = quantized_Linear(64 * block.expansion, num_classes, bitW=self.bitW)

    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            m.weight.data.normal_(0, math.sqrt(2. / n))
        elif isinstance(m, nn.BatchNorm2d):
            m.weight.data.fill_(1)
            m.bias.data.zero_()
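# Usage sketch (illustrative only): BasicBlock is assumed to be the standard CIFAR
# ResNet residual block with expansion == 1, and ResNet_Cifar is assumed to define
# the usual forward pass; neither is shown in this snippet. With [3, 3, 3] blocks
# this is a depth-20 CIFAR ResNet, here with 1-bit quantized weights.
model = ResNet_Cifar(BasicBlock, [3, 3, 3], num_classes=10, bitW=1)
logits = model(torch.randn(2, 3, 32, 32))  # CIFAR-sized input -> logits of shape [2, 10]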
def __init__(self, config):
    super().__init__()
    self.dense = quantized_Linear(config.intermediate_size, config.hidden_size, bitW=config.bitW)
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
def __init__(self, config):
    super().__init__()
    self.dense = quantized_Linear(config.hidden_size, config.intermediate_size, bitW=config.bitW)
    if isinstance(config.hidden_act, str):
        self.intermediate_act_fn = ACT2FN[config.hidden_act]
    else:
        self.intermediate_act_fn = config.hidden_act
def __init__(self, config):
    super().__init__(config)
    self.config = config
    self.embeddings = AlbertEmbeddings(config)
    self.encoder = AlbertTransformer(config)
    self.pooler = quantized_Linear(config.hidden_size, config.hidden_size, bitW=config.bitW)
    self.pooler_activation = nn.Tanh()
    self.init_weights()
def __init__(self, config):
    super().__init__(config)
    self.output_attentions = config.output_attentions
    self.num_attention_heads = config.num_attention_heads
    self.hidden_size = config.hidden_size
    self.attention_head_size = config.hidden_size // config.num_attention_heads
    self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
    self.dense = quantized_Linear(config.hidden_size, config.hidden_size, bitW=config.bitW)
    self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.pruned_heads = set()
def __init__(self, config):
    super(QuantBertSelfAttention, self).__init__()
    if config.hidden_size % config.num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (config.hidden_size, config.num_attention_heads))

    self.output_attentions = config.output_attentions  # False
    self.num_attention_heads = config.num_attention_heads  # 12
    self.attention_head_size = int(config.hidden_size / config.num_attention_heads)  # 768 / 12 = 64
    self.all_head_size = self.num_attention_heads * self.attention_head_size  # 12 * 64 = 768

    # Each projection packs num_attention_heads head projectors of shape
    # [hidden_size, attention_head_size] into a single quantized linear layer.
    self.query = quantized_Linear(config.hidden_size, self.all_head_size, bitW=config.bitW)
    self.key = quantized_Linear(config.hidden_size, self.all_head_size, bitW=config.bitW)
    self.value = quantized_Linear(config.hidden_size, self.all_head_size, bitW=config.bitW)
    # Full-precision equivalents:
    # self.query = nn.Linear(config.hidden_size, self.all_head_size)  # [hidden_size, all_head_size]
    # self.key = nn.Linear(config.hidden_size, self.all_head_size)
    # self.value = nn.Linear(config.hidden_size, self.all_head_size)

    self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
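# Sketch (standard BERT-style self-attention math, shown only to make the shapes
# above concrete; assumes `torch`, `math`, and `nn` are imported as in the rest of
# the repo, and the actual forward here may differ in details).
def transpose_for_scores(self, x):
    # [batch, seq_len, all_head_size] -> [batch, num_heads, seq_len, head_size]
    new_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
    return x.view(*new_shape).permute(0, 2, 1, 3)

def forward(self, hidden_states, attention_mask=None):
    q = self.transpose_for_scores(self.query(hidden_states))
    k = self.transpose_for_scores(self.key(hidden_states))
    v = self.transpose_for_scores(self.value(hidden_states))
    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.attention_head_size)
    if attention_mask is not None:
        scores = scores + attention_mask  # additive mask (large negative values on padding)
    probs = self.dropout(nn.Softmax(dim=-1)(scores))
    context = torch.matmul(probs, v).permute(0, 2, 1, 3).contiguous()
    context = context.view(*(context.size()[:-2] + (self.all_head_size,)))
    return (context, probs) if self.output_attentions else (context,)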
def __init__(self, config):
    super().__init__()
    self.dense = quantized_Linear(config.hidden_size, config.hidden_size, bitW=config.bitW)
    self.activation = nn.Tanh()
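# Illustrative only: the real quantized_Linear / quantized_CNN used throughout these
# constructors are not shown in this snippet. Below is a minimal DoReFa-style sketch
# of a linear layer matching the bitW interface, using a straight-through estimator
# for gradients; the repo's actual quantizer may differ.
import torch
import torch.nn as nn
import torch.nn.functional as F


class QuantizeSTE(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, n_levels):
        return torch.round(x * n_levels) / n_levels  # uniform quantization to n_levels + 1 values

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output, None  # straight-through estimator: pass gradients unchanged


class quantized_Linear(nn.Linear):
    def __init__(self, in_features, out_features, bias=True, bitW=32):
        super().__init__(in_features, out_features, bias=bias)
        self.bitW = bitW

    def quantize_weight(self, w):
        if self.bitW >= 32:
            return w  # full-precision pass-through
        if self.bitW == 1:
            # Binary weights scaled by the mean absolute value (BWN-style).
            return w.sign() * w.abs().mean()
        # DoReFa-style k-bit weights in [-1, 1].
        w = torch.tanh(w)
        w = w / (2 * w.abs().max()) + 0.5  # map to [0, 1]
        return 2 * QuantizeSTE.apply(w, 2 ** self.bitW - 1) - 1

    def forward(self, x):
        return F.linear(x, self.quantize_weight(self.weight), self.bias)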