def forward(self, text):
    embedded = self.embedding(text)
    # Recurrent encoder (LSTM-style: returns hidden and cell states).
    x, (hn, cn) = self.hidden(embedded)  # (seq_len, batch, hidden_size)
    x = x[-1, :, :]  # keep only the last time step
    x = F.hardswish(self.decode(x))
    # Weight-tied output projection: score against the embedding matrix.
    # (.data detaches the embedding weights from autograd here.)
    y = F.linear(x, self.embedding.weight.data)
    return y
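# A minimal, self-contained sketch of the weight-tied projection used above:
# F.linear(x, embedding.weight) scores each decoded hidden state against every
# row of the embedding matrix, so the output layer shares parameters with the
# input embedding. All sizes below are illustrative assumptions.
import torch
import torch.nn.functional as F

vocab_size, hidden_size = 100, 32
embedding = torch.nn.Embedding(vocab_size, hidden_size)
x = torch.randn(8, hidden_size)         # one decoded state per batch element
logits = F.linear(x, embedding.weight)  # (8, vocab_size)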
def train_batch(self, x, y, jump_aux=False, drop_final=False):
    layers = self.net(x)
    if self.feature_layer == 'logits':
        logits = layers['logits']
        loss = F.cross_entropy(logits, y)
        return dict(loss=loss, logits=logits)
    feature_maps = layers[self.feature_layer]
    raw_attentions = layers[self.attention_layer]
    attention_maps_ = self.attentions(raw_attentions)
    # Randomly drop whole attention maps (one mask entry per map).
    dropout_mask = self.dropout(
        torch.ones([attention_maps_.shape[0], self.M, 1], device=x.device))
    attention_maps = attention_maps_ * torch.unsqueeze(dropout_mask, -1)
    feature_maps, feature_maps_d = self.texture_enhance(
        feature_maps, attention_maps_)
    # Normalize the texture-enhanced maps to zero mean, unit std per channel.
    feature_maps_d = feature_maps_d - feature_maps_d.mean(dim=[2, 3],
                                                          keepdim=True)
    feature_maps_d = feature_maps_d / (
        torch.std(feature_maps_d, dim=[2, 3], keepdim=True) + 1e-8)
    feature_matrix_ = self.atp(feature_maps, attention_maps_)
    feature_matrix = feature_matrix_ * dropout_mask
    B, M, N = feature_matrix.size()
    if not jump_aux:
        aux_loss, feature_matrix_d = self.auxiliary_loss(
            feature_maps_d, attention_maps_, y)
    else:
        feature_matrix_d = self.atp(feature_maps_d, attention_maps_)
        aux_loss = 0
    feature_matrix = feature_matrix.view(B, -1)
    feature_matrix = F.hardswish(self.projection_local(feature_matrix))
    final = layers['final']
    attention_maps = attention_maps.sum(dim=1, keepdim=True)
    final = self.atp(final, attention_maps, norm=1).squeeze(1)
    final = self.dropout_final(final)
    projected_final = F.hardswish(self.project_final(final))
    if drop_final:
        projected_final *= 0
    feature_matrix = torch.cat((feature_matrix, projected_final), 1)
    ensemble_logit = self.ensemble_classifier_fc(feature_matrix)
    ensemble_loss = F.cross_entropy(ensemble_logit, y)
    return dict(ensemble_loss=ensemble_loss,
                aux_loss=aux_loss,
                attention_maps=attention_maps_,
                ensemble_logit=ensemble_logit,
                feature_matrix=feature_matrix_,
                feature_matrix_d=feature_matrix_d)
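# Hedged sketch of the attention pooling ("atp") that train_batch and forward
# rely on: each of the M attention maps spatially weights the feature maps,
# yielding one pooled feature vector per map. This is an assumption about the
# interface, not the repository's exact implementation; the norm=1 semantics
# (normalize by total attention mass) are likewise assumed.
import torch

def attention_pooling(feature_maps, attention_maps, norm=None):
    # feature_maps: (B, C, H, W); attention_maps: (B, M, H, W)
    pooled = torch.einsum('bmhw,bchw->bmc', attention_maps, feature_maps)
    if norm == 1:
        pooled = pooled / (attention_maps.sum(dim=[2, 3]).unsqueeze(-1) + 1e-8)
    return pooled  # (B, M, C): one pooled feature vector per attention map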
def forward(self, src: Tensor, tgt: Tensor) -> Tensor:
    r"""Forward propagate data.

    Args:
        src: Input to create the hidden context vector.
        tgt: Expected output.

    Shapes:
        src: (S, N, E)
        tgt: (T, N, E)
    """
    T, N, E = tgt.shape
    assert T == self.tgt_window, (
        f"The output sequence length must be the same length as the target "
        f"window. {T} != {self.tgt_window}")
    tgt_future_mask = self.future_token_square_mask(T)
    assert src.shape[-1] == self.n_in_features, (
        "The shape must be of size time_features + linear_features.")
    time_features = src[:, :, :self.n_time_features]
    linear_features = src[:, :, -self.n_linear_features:]
    assert time_features.shape[-1] > 0, (
        "There should be at least one time feature used.")
    assert linear_features.shape[-1] > 0, (
        "There should be at least one linear feature used.")
    # Scale the time embeddings by sqrt(d), as in the original Transformer.
    time_embeddings = self.time_embedding(time_features) * math.sqrt(
        self.d_time_embed)
    linear_embeddings = F.hardswish(self.linear_embedding(linear_features))
    src_embeddings = F.hardswish(
        torch.cat([time_embeddings, linear_embeddings], dim=-1))
    encoded = self.encoder(src_embeddings)
    tgt_embeddings = self.tgt_embedding(tgt)
    decoded = self.decoder(tgt_embeddings, encoded, tgt_mask=tgt_future_mask)
    out = self.projection(decoded)
    return out
def forward(self, src: Tensor) -> Tensor: r"""Forward propagate data. It expects the src tensor to be of shape (S, N, F) where the first time_features E are used in the Time2Vec model. The model will consume the first time_features from the src tensor and create time embeddings via a Time2Vec model. It will then pass the remaining linear_features into a standard linear layer to be concatenated with the time embeddings. Args: src: features Shapes: src: (S, N, F) out: (S, N, P) """ assert (src.shape[-1] == self.n_in_features ), "The shape must be of size time_features + linear_features." time_features = src[:, :, :self.n_time_features] linear_features = src[:, :, self.n_time_features:] assert (time_features.shape[-1] > 0), "There should at least be one time feature used." assert (linear_features.shape[-1] > 0), "There should at least be one linear feature used." time_embeddings = self.time_embedding(time_features) * math.sqrt( self.d_time_embed) if self.positional_encoding is not None: time_embeddings = self.positional_encoding(time_embeddings) linear_proj = self.dropout1(self.linear_src(linear_features)) # Concatenate the time embeddings and linear features that were # previously separated. x = F.hardswish(torch.cat([time_embeddings, linear_proj], dim=-1)) assert x.shape[-1] == self.d_time_embed + self.d_linear_embed, ( "The dimensionality of the concatenated time embeddings and " "linear hidden dims must be equal to d_time_embed + d_linear_embed." ) encoded = F.hardswish(self.encoder(x)) out = self.projection(encoded) return out
def forward(self, x, y=0, train_batch=False, AG=None):
    if train_batch:
        if AG is None:
            return self.train_batch(x, y)
        else:
            loss_pack = self.train_batch(x, y)
            # AGDA: generate attention-guided augmentations without gradients.
            with torch.no_grad():
                Xaug, index = AG.agda(x, loss_pack['attention_maps'])
            loss_pack2 = self.train_batch(Xaug, y, jump_aux=False)
            loss_pack['AGDA_ensemble_loss'] = loss_pack2['ensemble_loss']
            loss_pack['AGDA_aux_loss'] = loss_pack2['aux_loss']
            one_hot = F.one_hot(index, self.M)
            # Match features only for the attention regions left unaugmented.
            loss_pack['match_loss'] = torch.mean(
                torch.norm(loss_pack2['feature_matrix_d'] -
                           loss_pack['feature_matrix_d'], dim=-1) *
                (torch.ones_like(one_hot) - one_hot))
            return loss_pack
    layers = self.net(x)
    if self.feature_layer == 'logits':
        logits = layers['logits']
        return logits
    raw_attentions = layers[self.attention_layer]
    attention_maps = self.attentions(raw_attentions)
    feature_maps = layers[self.feature_layer]
    feature_maps, feature_maps_d = self.texture_enhance(
        feature_maps, attention_maps)
    feature_matrix = self.atp(feature_maps, attention_maps)
    B, M, N = feature_matrix.size()
    feature_matrix = self.dropout(feature_matrix)
    feature_matrix = feature_matrix.view(B, -1)
    feature_matrix = F.hardswish(self.projection_local(feature_matrix))
    final = layers['final']
    attention_maps2 = attention_maps.sum(dim=1, keepdim=True)
    final = self.atp(final, attention_maps2, norm=1).squeeze(1)
    projected_final = F.hardswish(self.project_final(final))
    feature_matrix = torch.cat((feature_matrix, projected_final), 1)
    ensemble_logit = self.ensemble_classifier_fc(feature_matrix)
    return ensemble_logit
def forward(self, src: Tensor) -> Tensor: r"""Forward propagate data. Args: src: tensor containing time features. Shapes: src: (*, N, F) output: (*, N, E) """ linear = self.dropout1(self.linear_time_proj(src)) periodic = self.dropout2(self.activation(self.periodic_time_proj(src))) out = F.hardswish(torch.cat([linear, periodic], dim=-1)) out = self.dropout3(self.proj(out)) return out
def forward(self, x):
    return F.hardswish(x, inplace=self.inplace)
def forward(self, x):
    return F.hardswish(x)
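# For reference: F.hardswish computes x * relu6(x + 3) / 6, the piecewise-
# linear swish approximation introduced in MobileNetV3. A quick numerical
# check of that identity:
import torch
import torch.nn.functional as F

x = torch.linspace(-4.0, 4.0, steps=9)
assert torch.allclose(F.hardswish(x), x * F.relu6(x + 3.0) / 6.0)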
def forward(self, input):
    # activation_post_process is the attached observer/fake-quantize module
    # (PyTorch quantization convention).
    return self.activation_post_process(F.hardswish(input))
def forward(self, x):
    # Shared residual tower: every even-numbered block adds a skip
    # connection from two layers back (h2 = f(h1) + h0, h4 = f(h3) + h2, ...).
    h0 = F.hardswish(self.bn0(self.conv0(x)))
    h1 = F.hardswish(self.bn1(self.conv1(h0)))
    h2 = F.hardswish(self.bn2(self.conv2(h1)) + h0)
    h3 = F.hardswish(self.bn3(self.conv3(h2)))
    h4 = F.hardswish(self.bn4(self.conv4(h3)) + h2)
    h5 = F.hardswish(self.bn5(self.conv5(h4)))
    h6 = F.hardswish(self.bn6(self.conv6(h5)) + h4)
    h7 = F.hardswish(self.bn7(self.conv7(h6)))
    h8 = F.hardswish(self.bn8(self.conv8(h7)) + h6)
    h9 = F.hardswish(self.bn9(self.conv9(h8)))
    h10 = F.hardswish(self.bn10(self.conv10(h9)) + h8)
    h11 = F.hardswish(self.bn11(self.conv11(h10)))
    h12 = F.hardswish(self.bn12(self.conv12(h11)) + h10)
    h13 = F.hardswish(self.bn13(self.conv13(h12)))
    h14 = F.hardswish(self.bn14(self.conv14(h13)) + h12)
    h15 = F.hardswish(self.bn15(self.conv15(h14)))
    h16 = F.hardswish(self.bn16(self.conv16(h15)) + h14)
    h17 = F.hardswish(self.bn17(self.conv17(h16)))
    h18 = F.hardswish(self.bn18(self.conv18(h17)) + h16)
    h19 = F.hardswish(self.bn19(self.conv19(h18)))
    h20 = F.hardswish(self.bn20(self.conv20(h19)) + h18)
    h21 = F.hardswish(self.bn21(self.conv21(h20)))
    h22 = F.hardswish(self.bn22(self.conv22(h21)) + h20)
    h23 = F.hardswish(self.bn23(self.conv23(h22)))
    h24 = F.hardswish(self.bn24(self.conv24(h23)) + h22)
    h25 = F.hardswish(self.bn25(self.conv25(h24)))
    h26 = F.hardswish(self.bn26(self.conv26(h25)) + h24)
    h27 = F.hardswish(self.bn27(self.conv27(h26)))
    h28 = F.hardswish(self.bn28(self.conv28(h27)) + h26)
    h29 = F.hardswish(self.bn29(self.conv29(h28)))
    h30 = F.hardswish(self.bn30(self.conv30(h29)) + h28)
    h31 = F.hardswish(self.bn31(self.conv31(h30)))
    h32 = F.hardswish(self.bn32(self.conv32(h31)) + h30)
    h33 = F.hardswish(self.bn33(self.conv33(h32)))
    h34 = F.hardswish(self.bn34(self.conv34(h33)) + h32)
    h35 = F.hardswish(self.bn35(self.conv35(h34)))
    h36 = F.hardswish(self.bn36(self.conv36(h35)) + h34)
    h37 = F.hardswish(self.bn37(self.conv37(h36)))
    h38 = F.hardswish(self.bn38(self.conv38(h37)) + h36)

    # policy network
    h_p1 = F.hardswish(self.bn_p1(self.conv_p1(h38)))
    h_p1 = torch.flatten(h_p1, 1)
    out_p = self.fc_p2(h_p1)

    # value network
    h_v1 = F.hardswish(self.bn_v1(self.conv_v1(h38)))
    h_v1 = torch.flatten(h_v1, 1)
    h_v2 = F.hardswish(self.fc_v2(h_v1))
    out_v = torch.tanh(self.fc_v3(h_v2))
    return (out_p, out_v)
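# The 38-layer body above repeats one pattern: a conv-BN-hardswish block
# followed by a conv-BN block whose output is summed with the activation from
# two layers back. A hedged, behavior-equivalent sketch of that pattern as a
# loop (convs/bns are hypothetical lists holding conv0..conv38 and bn0..bn38):
import torch.nn.functional as F

def residual_tower(x, convs, bns):
    h = F.hardswish(bns[0](convs[0](x)))                    # h0
    for i in range(1, len(convs) - 1, 2):
        mid = F.hardswish(bns[i](convs[i](h)))              # h_i
        h = F.hardswish(bns[i + 1](convs[i + 1](mid)) + h)  # h_{i+1} with skip
    return h                                                # h38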
def forward(self, x, y, z, w):
    x = F.hardswish(x)
    y = hardswish_forward_0(y)
    z = hardswish_forward_1(z)
    w = hardswish_forward_2(w)
    return x, y, z, w
def forward(self, input: torch.Tensor) -> torch.Tensor:
    return F.hardswish(input)
def optimize_layer(self, node, float_layer, layer_inputs, layer_act_group,
                   net_inputs, net_loss, last_quant_mods, device):
    batch_factor = 0.5 if layer_inputs[0].size(0) == 1 else 1

    layer = node.module
    float_data = np.fabs(float_layer.weight.cpu().detach().numpy().flatten())
    quant_data = np.fabs(layer.weight.cpu().detach().numpy().flatten())
    q_noise = np.square(float_data - quant_data).mean()
    # Signal-to-quantization-noise ratio in dB, normalized by 8 bits.
    sqnr = 10 * np.log10(np.square(float_data).mean() / q_noise)
    quantize_efficiency = sqnr / 8.0

    lr_factor = NndctOption.nndct_finetune_lr_factor.value
    lr_factor = lr_factor * batch_factor
    if quantize_efficiency > 4.5:
        lr_factor = 0.1 * lr_factor * batch_factor

    lr_w = lr_factor * layer.weight.std().item()
    opt_weight = torch.optim.Adam([layer.weight], lr=lr_w)

    opt_bias = None
    lr_b = 0
    if hasattr(layer, "bias") and layer.bias is not None:
        if layer.bias.flatten().shape[0] == 1:
            lr_b = 0.0
        else:
            lr_b = lr_factor * layer.bias.std().item()
        opt_bias = torch.optim.Adam([layer.bias], lr=lr_b)

    iters = 20
    total_loss = AverageMeter("layer_loss")
    best_params = self.get_layer_params(layer)

    # Cache the float layer's outputs over all network inputs.
    handlers = self.hook_cache_output([float_layer])
    for input_args in zip(*net_inputs):
        with torch.no_grad():
            f_model = self._float_model.to(device)
            f_model.eval()
            new_input_args = []
            for ip in input_args:
                if isinstance(ip, torch.Tensor):
                    new_input_args.append(ip.to(device))
            _ = f_model(*new_input_args)
    torch.cuda.empty_cache()
    self.clean_hooks(handlers)

    for i in range(iters):
        for idx, layer_input in enumerate(layer_inputs):
            train_output = self._cached_outputs[float_layer][idx].to(device)
            qout = layer(layer_input.to(device))
            if node in layer_act_group:
                # Apply the follow-up activation to both outputs so the MSE
                # target matches the quantized layer's effective output.
                act_node = layer_act_group[node]
                q_act_layer = act_node.module
                inplace = q_act_layer.inplace
                q_act_layer.inplace = False
                qout = q_act_layer(qout)
                q_act_layer.inplace = inplace
                if act_node.op.type == NNDCT_OP.RELU:
                    train_output = F.relu(train_output)
                elif act_node.op.type == NNDCT_OP.RELU6:
                    train_output = F.relu6(train_output)
                elif act_node.op.type == NNDCT_OP.HSIGMOID:
                    train_output = F.hardsigmoid(train_output)
                elif act_node.op.type == NNDCT_OP.HSWISH:
                    train_output = F.hardswish(train_output)
                else:
                    raise NotImplementedError()

            if NndctOption.nndct_quant_opt.value > 0:
                # Regularize toward the float weights.
                loss = F.mse_loss(qout, train_output) + F.mse_loss(
                    layer.weight, float_layer.weight.detach().to(device))
            else:
                loss = F.mse_loss(qout, train_output)

            total_loss.update(loss.item())
            opt_weight.zero_grad()
            if opt_bias:
                opt_bias.zero_grad()
            loss.backward()
            opt_weight.step()
            if opt_bias:
                opt_bias.step()

        float_data = np.fabs(layer.weight.cpu().detach().numpy().flatten())
        layer.param_quantized = False
        handlers = self.hook_cache_output(last_quant_mods, hook_type="single")
        eval_loss = self.eval_loss(net_inputs, last_quant_mods, device)
        self.clean_hooks(handlers)
        quant_data = np.fabs(layer.weight.cpu().detach().numpy().flatten())
        q_noise = np.square(float_data - quant_data).mean()
        sqnr = 10 * np.log10(np.square(float_data).mean() / q_noise)
        quantize_efficiency = sqnr / 8.0
        # Early-stop: keep the best parameters seen so far.
        if eval_loss < net_loss:
            best_params = self.get_layer_params(layer)
            net_loss = eval_loss
        else:
            self.set_layer_params(layer, best_params[0], best_params[1])
            break

    del self.cached_outputs[float_layer]
    torch.cuda.empty_cache()
    return net_loss
def hardswish(input, *args, **kwargs):
    # input.F holds the feature tensor of a wrapped sparse tensor;
    # _wrap_tensor rebuilds the wrapper around the transformed features.
    return _wrap_tensor(input, F.hardswish(input.F, *args, **kwargs))
def forward(self, input: Tensor) -> Tensor:
    return F.hardswish(input, self.inplace)
def forward(self, input):
    return F.hardswish(input, inplace=self.inplace)
def forward(self, input):
    return F.hardswish(input)