def knowledge_distillation_loss(y_true, y_pred, nb_classes, alpha=0.2, beta=1):
    # Extract the one-hot encoded values and the softs separately
    # so that we can create two objective functions
    y_true, y_true_softs = y_true[:, :nb_classes], y_true[:, nb_classes:]
    y_pred, y_pred_softs = y_pred[:, :nb_classes], y_pred[:, nb_classes:]
    loss = alpha * logloss(y_true, y_pred) + beta * logloss(y_true_softs, y_pred_softs)
    return loss
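# A minimal sketch of the target layout this loss expects, assuming `logloss` is
# categorical cross-entropy (e.g. keras.losses.categorical_crossentropy). Dummy
# NumPy data; all names below are illustrative, not from the original code.
import numpy as np

nb_classes = 10
n = 32
rng = np.random.default_rng(0)

# Hard targets: one-hot labels.
labels = rng.integers(0, nb_classes, size=n)
y_hard = np.eye(nb_classes)[labels]

# Soft targets: temperature-softened teacher probabilities (random stand-ins here).
teacher_logits = rng.normal(size=(n, nb_classes))
temp = 5.0
y_soft = np.exp(teacher_logits / temp)
y_soft /= y_soft.sum(axis=1, keepdims=True)

# The loss slices along axis 1, so the hard and soft halves are concatenated there,
# and the student model must emit 2 * nb_classes values in the same layout.
y_true_concat = np.concatenate([y_hard, y_soft], axis=1)  # shape (n, 2 * nb_classes)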
def loss(y_true, y_pred):
    # Softened targets from the previous (teacher) model's logits; `old_logits`,
    # `new_logits`, `temp`, `old_classes` and `L` are captured from the enclosing scope.
    y_soft = K.softmax(old_logits / temp)
    # Student logits restricted to the old classes, softened with the same temperature.
    logits_pred = new_logits[:, :old_classes]
    y_pred_soft = K.softmax(logits_pred / temp)
    # Sparse cross-entropy on the hard labels plus a weighted distillation term on the old classes.
    return sparselogloss(y_true, y_pred) + L * logloss(y_soft, y_pred_soft)
def KD_loss(y_true, y_pred, lambd=0.5, T=10.0):
    y_true, y_true_KD = y_true[:, :10], y_true[:, 10:]
    y_pred, y_pred_KD = y_pred[:, :10], y_pred[:, 10:]
    # Classic cross-entropy (without temperature) on the hard targets
    CE_loss = logloss(y_true, y_pred)
    # KL-divergence on the temperature-softened teacher targets,
    # scaled by T**2 so its gradients stay comparable to the hard-target term
    KL_loss = T**2 * KLD_Loss(y_true_KD, y_pred_KD)
    return (1 - lambd) * CE_loss + lambd * KL_loss
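# Keras calls a loss with only (y_true, y_pred), so the extra hyperparameters of
# KD_loss are typically bound beforehand; a small sketch using functools.partial
# (the `student` model below is hypothetical, not from the original code).
from functools import partial

kd_loss_fn = partial(KD_loss, lambd=0.5, T=10.0)
# student.compile(optimizer="adam", loss=kd_loss_fn)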
def soft_logloss(self, y_true, y_pred):
    # Cross-entropy on the soft-target half of the concatenated tensors.
    y_true_soft = y_true[:, self.num_class:]
    y_pred_soft = y_pred[:, self.num_class:]
    return logloss(y_true_soft, y_pred_soft)
def categorical_crossentropy(self, y_true, y_pred):
    # Cross-entropy on the hard-label half of the concatenated tensors.
    y_true = y_true[:, :self.num_class]
    y_pred = y_pred[:, :self.num_class]
    return logloss(y_true, y_pred)
def distill_loss(self, y_true, y_pred):
    # The tensors are 2-D because the first axis is the batch dimension;
    # split the hard labels and the teacher's soft targets along the class axis.
    y_true, y_true_soft = y_true[:, :self.num_class], y_true[:, self.num_class:]
    y_pred, y_pred_soft = y_pred[:, :self.num_class], y_pred[:, self.num_class:]
    return self.lambda_const * logloss(y_true, y_pred) + logloss(y_true_soft, y_pred_soft)
def knowledge_distillation_loss(input_distillation):
    y_pred, y_true, y_soft, y_pred_soft = input_distillation
    # Weighted sum of the hard-label term and the distillation term; the
    # temperature**2 factor rescales the soft-target gradients.
    return (1 - args.lambda_const) * logloss(y_true, y_pred) + \
        args.lambda_const * args.temperature * args.temperature * logloss(y_soft, y_pred_soft)
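# This variant reads its hyperparameters from a module-level `args` object and takes
# the four tensors as a single list, as when the loss term is assembled inside the
# model graph. A hedged sketch of the fields `args` must provide (values illustrative):
from argparse import Namespace

args = Namespace(lambda_const=0.3, temperature=5.0)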