def train_one_epoch_mixup(model, train_loader, criterion, optimizer, epoch,
                          lr_scheduler, logger, loss_record, scaler, args):
    loss_record.reset()
    tic = time.time()

    model.train()
    for i, (data, labels) in enumerate(train_loader):
        data = data.cuda(args.gpu, non_blocking=True)
        labels = labels.cuda(args.gpu, non_blocking=True)

        data, labels_a, labels_b, lam = mixup_data(data, labels,
                                                   args.mixup_alpha)
        optimizer.zero_grad()
        with autocast(enabled=args.mix_precision_training):
            outputs = model(data)
            loss = mixup_criterion(criterion, outputs, labels_a, labels_b, lam)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        loss_record.update(loss)
        lr_scheduler.step()

        if i % args.log_interval == 0 and i != 0:
            logger.info(
                'Epoch {}, Node {}, GPU {}, Iter {}, {}:{:.5}, {} samples/s, lr: {:.5}.'
                .format(epoch, args.world, args.gpu, i, loss_record.name,
                        loss_record.get(),
                        int((i * args.batch_size) // (time.time() - tic)),
                        lr_scheduler.learning_rate))
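Every example on this page assumes `mixup_data` and `mixup_criterion` helpers imported from elsewhere in each repository. As a reference point, here is a minimal sketch following the canonical mixup implementation; the exact signatures in the individual repos may differ slightly.

import numpy as np
import torch


def mixup_data(x, y, alpha=1.0):
    # Sample the mixing coefficient from Beta(alpha, alpha); alpha <= 0 disables mixing.
    lam = float(np.random.beta(alpha, alpha)) if alpha > 0 else 1.0
    index = torch.randperm(x.size(0), device=x.device)
    mixed_x = lam * x + (1 - lam) * x[index]
    # Return both label sets so the loss can be interpolated with the same lam.
    return mixed_x, y, y[index], lam


def mixup_criterion(criterion, pred, y_a, y_b, lam):
    # Convex combination of the losses against the original and the shuffled labels.
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)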
Example #2
def train(net, train_loader, optimizer, criterion):
    losses = AverageMeter()
    accs = AverageMeter()
    net.train()
    alpha = 0.1
    for step, (x, y) in enumerate(tqdm(train_loader)):
        x = x.to(device)
        y = y.to(device)#.unsqueeze(-1).float()

        if params['mixup']:
            x, y_a, y_b, lam = mixup_data(x, y, alpha)
            y_ = net(x)
            loss = mixup_criterion(criterion, y_, y_a, y_b, lam)
        else:
            y_ = net(x)
            loss = criterion(y_, y)
        acc = cal_acc(y_, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.update(loss.item())
        accs.update(acc.item())
    return losses.avg, accs.avg
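Example #2 also leans on an `AverageMeter` (and a `cal_acc` helper) that is not shown. A minimal no-argument AverageMeter consistent with how it is used above might look like the sketch below; the later `train(train_loader, model, criterion, optimizer, epoch, args, TB)` example constructs it with a name and format string, so that repo's variant carries extra display fields.

class AverageMeter:
    """Keeps a running sum and average of a scalar (e.g. loss or accuracy)."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count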
Example #3
def train_mixup():
    mixup_off_epoch = epochs if args.mixup_off_epoch == 0 else args.mixup_off_epoch
    for epoch in range(resume_epoch, epochs):
        train_sampler.set_epoch(epoch)
        loss_record.reset()
        alpha = args.mixup_alpha if epoch < mixup_off_epoch else 0
        tic = time.time()

        model.train()
        for i, (data, labels) in enumerate(train_data):
            data = data.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            data, labels_a, labels_b, lam = mixup_data(data, labels, alpha)
            optimizer.zero_grad()
            outputs = model(data)
            loss = mixup_criterion(Loss, outputs, labels_a, labels_b, lam)

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()

            loss_record.update(loss)
            lr_scheduler.step()

            if i % args.log_interval == 0 and i != 0:
                logger.info(
                    'Epoch {}, Iter {}, {}:{:.5}, {} samples/s.'.format(
                        epoch, i, loss_record.name, loss_record.get(),
                        int((i * batch_size) // (time.time() - tic))))

        train_speed = int(num_training_samples // (time.time() - tic))
        train_msg = 'Train Epoch {}: {}:{:.5}, {} samples/s, lr:{:.5}'.format(
            epoch, loss_record.name, loss_record.get(), train_speed,
            lr_scheduler.learning_rate)
        logger.info(train_msg)
        test(epoch)
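Example #3 scales the loss with NVIDIA apex, which only works if the model and optimizer were wrapped once before the training loop. A minimal sketch of the assumed setup (opt_level 'O1' is just a common choice here):

from apex import amp  # NVIDIA apex, not torch.cuda.amp

# One-time setup before train_mixup() runs; amp.scale_loss() relies on this wrapping.
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')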
Example #4
def train(epoch):
    print('\nEpoch: %d' % epoch)
    snet.train()
    decoder.train()
    train_loss = 0
    train_cls_loss = 0

    conf_mat = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_a = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_b = np.zeros((NUM_CLASSES, NUM_CLASSES))

    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate**frac
        current_lr = args.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = args.lr
    print('learning_rate: %s' % str(current_lr))

    for batch_idx, (img_teacher, img_student,
                    target) in enumerate(trainloader):

        if args.cuda:
            img_teacher = img_teacher.cuda(non_blocking=True)
            img_student = img_student.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        optimizer.zero_grad()

        if args.augmentation:
            img_teacher, teacher_target_a, teacher_target_b, teacher_lam = mixup_data(
                img_teacher, target, 0.6)
            img_teacher, teacher_target_a, teacher_target_b = map(
                Variable, (img_teacher, teacher_target_a, teacher_target_b))

            img_student, student_target_a, student_target_b, student_lam = mixup_data(
                img_student, target, 0.6)
            img_student, student_target_a, student_target_b = map(
                Variable, (img_student, student_target_a, student_target_b))
        else:
            img_teacher, img_student, target = Variable(img_teacher), Variable(
                img_student), Variable(target)

        rb1_s, rb2_s, rb3_s, mimic_s, out_s = snet(img_student)
        rb1_t, rb2_t, rb3_t, mimic_t, out_t = tnet(img_teacher)

        if args.augmentation:
            cls_loss = mixup_criterion(Cls_crit, out_s, student_target_a,
                                       student_target_b, student_lam)
        else:
            cls_loss = Cls_crit(out_s, target)

        kd_loss = KD_T_crit(out_t, out_s)

        if args.distillation == 'KD':
            loss = 0.2 * cls_loss + 0.8 * kd_loss
        elif args.distillation == 'DE':
            new_rb1_s = decoder(rb1_s)
            decoder_loss = losses.styleLoss(img_teacher, new_rb1_s.cuda(),
                                            MSE_crit)
            loss = 0.2 * cls_loss + 0.8 * kd_loss + 0.1 * decoder_loss
        elif args.distillation == 'AS':
            rb2_loss = losses.Absdiff_Similarity(rb2_t, rb2_s).cuda()
            loss = 0.2 * cls_loss + 0.8 * kd_loss + 0.9 * rb2_loss
        elif args.distillation == 'DEAS':
            new_rb1_s = decoder(rb1_s)
            decoder_loss = losses.styleLoss(img_teacher, new_rb1_s.cuda(),
                                            MSE_crit)
            rb2_loss = losses.Absdiff_Similarity(rb2_t, rb2_s).cuda()
            loss = 0.2 * cls_loss + 0.8 * kd_loss + 0.1 * decoder_loss + 0.9 * rb2_loss
        elif args.distillation == 'SSDEAS':
            new_rb1_s = decoder(rb1_s)
            decoder_loss = losses.styleLoss(img_teacher, new_rb1_s.cuda(),
                                            MSE_crit)
            rb2_loss = losses.Absdiff_Similarity(rb2_t, rb2_s).cuda()
            loss = 0 * cls_loss + 0 * kd_loss + 0.1 * decoder_loss + 0.9 * rb2_loss
        else:
            raise Exception('Invalid distillation name...')

        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.item()
        train_cls_loss += cls_loss.item()

        if args.augmentation:
            conf_mat_a += losses.confusion_matrix(out_s, student_target_a,
                                                  NUM_CLASSES)
            acc_a = sum([conf_mat_a[i, i] for i in range(conf_mat_a.shape[0])
                         ]) / conf_mat_a.sum()
            precision_a = np.array([
                conf_mat_a[i, i] / (conf_mat_a[i].sum() + 1e-10)
                for i in range(conf_mat_a.shape[0])
            ])
            recall_a = np.array([
                conf_mat_a[i, i] / (conf_mat_a[:, i].sum() + 1e-10)
                for i in range(conf_mat_a.shape[0])
            ])
            mAP_a = sum(precision_a) / len(precision_a)
            F1_score_a = (2 * precision_a * recall_a /
                          (precision_a + recall_a + 1e-10)).mean()

            conf_mat_b += losses.confusion_matrix(out_s, student_target_b,
                                                  NUM_CLASSES)
            acc_b = sum([conf_mat_b[i, i] for i in range(conf_mat_b.shape[0])
                         ]) / conf_mat_b.sum()
            precision_b = np.array([
                conf_mat_b[i, i] / (conf_mat_b[i].sum() + 1e-10)
                for i in range(conf_mat_b.shape[0])
            ])
            recall_b = np.array([
                conf_mat_b[i, i] / (conf_mat_b[:, i].sum() + 1e-10)
                for i in range(conf_mat_b.shape[0])
            ])
            mAP_b = sum(precision_b) / len(precision_b)
            F1_score_b = (2 * precision_b * recall_b /
                          (precision_b + recall_b + 1e-10)).mean()

            acc = student_lam * acc_a + (1 - student_lam) * acc_b
            mAP = student_lam * mAP_a + (1 - student_lam) * mAP_b
            F1_score = student_lam * F1_score_a + (1 -
                                                   student_lam) * F1_score_b

        else:
            conf_mat += losses.confusion_matrix(out_s, target, NUM_CLASSES)
            acc = sum([conf_mat[i, i]
                       for i in range(conf_mat.shape[0])]) / conf_mat.sum()
            precision = [
                conf_mat[i, i] / (conf_mat[i].sum() + 1e-10)
                for i in range(conf_mat.shape[0])
            ]
            mAP = sum(precision) / len(precision)

            recall = [
                conf_mat[i, i] / (conf_mat[:, i].sum() + 1e-10)
                for i in range(conf_mat.shape[0])
            ]
            precision = np.array(precision)
            recall = np.array(recall)
            f1 = 2 * precision * recall / (precision + recall + 1e-10)
            F1_score = f1.mean()

        #utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% | mAP: %.3f%% | F1: %.3f%%'
        #% (train_loss/(batch_idx+1), 100.*acc, 100.* mAP, 100.* F1_score))

    return train_cls_loss / (batch_idx + 1), 100. * acc, 100. * mAP, 100. * F1_score
Example #5
def train_base(model, cost, optimizer, train_loader, test_loader, args):
    '''

    :param model: model to train
    :param cost: loss function
    :param optimizer: optimizer
    :param train_loader: training data loader
    :param test_loader: test data loader
    :param args: configuration arguments
    :return:
    '''

    # Print the training arguments
    print(args)

    # Initialization: start the timer and create the save directories
    start = time.time()

    if not os.path.exists(args.model_root):
        os.makedirs(args.model_root)
    if not os.path.exists(args.log_root):
        os.makedirs(args.log_root)

    models_dir = args.model_root + '/' + args.model_name
    log_dir = args.log_root + '/' + args.log_name

    # Save the argv/args parameters to the log file
    with open(log_dir, 'a+', newline='') as f:
        my_writer = csv.writer(f)
        args_dict = vars(args)
        for key, value in args_dict.items():
            my_writer.writerow([key, value])

    # Whether to use cosine learning-rate decay
    if args.lrcos:
        print("lrcos is using")
        cos_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.train_epoch, eta_min=0)

        # Whether to use learning-rate warmup
        if args.lr_warmup:
            scheduler_warmup = GradualWarmupScheduler(args, optimizer, multiplier=1,
                                                      after_scheduler=cos_scheduler)

    # Training initialization
    epoch_num = args.train_epoch  # number of epochs to train
    log_interval = args.log_interval  # print status every this many batches
    save_interval = args.save_interval  # save a checkpoint every this many epochs

    batch_num = 0
    train_loss = 0
    log_list = []  # list of log values to save

    epoch = 0
    accuracy_best = 0

    # If resuming training, load the previously saved state
    if args.retrain:
        if not os.path.exists(models_dir):
            print("no trained model")
        else:
            state_read = torch.load(models_dir)
            model.load_state_dict(state_read['model_state'])
            optimizer.load_state_dict(state_read['optim_state'])
            epoch = state_read['Epoch']
            print("retaining")

    # Training
    while epoch < epoch_num:  # loop over epochs
        for batch_idx, (data, target) in enumerate(
                tqdm(train_loader, desc="Epoch {}/{}".format(epoch, epoch_num))):

            batch_num += 1
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()

            if args.mixup:
                mixup_alpha = args.mixup_alpha
                inputs, labels_a, labels_b, lam = mixup_data(data, target, alpha=mixup_alpha)
            else:
                inputs = data

            optimizer.zero_grad()  # zero the gradients so they do not accumulate across batches

            output = model(inputs)  # forward pass: feed the (possibly mixed) inputs through the network

            if args.mixup:
                loss = mixup_criterion(cost, output, labels_a, labels_b, lam)
            else:
                loss = cost(output, target)

            train_loss += loss.item()
            loss.backward()  # backward pass: compute gradients w.r.t. every parameter
            optimizer.step()  # update the parameters with the computed gradients
            # if batch_idx % log_interval == 0:  # optionally print progress; log_interval is set in args
            #     print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            #         epoch, batch_idx * len(data), len(train_loader.dataset),
            #                100. * batch_idx / len(train_loader), loss.item()))

        # Evaluate the model on the test set
        accuracy_test = calc_accuracy(model, loader=test_loader)
        if accuracy_test > accuracy_best:
            accuracy_best = accuracy_test
        log_list.append(train_loss / len(train_loader))
        log_list.append(accuracy_test)
        log_list.append(accuracy_best)
        print(
            "Epoch {}, loss={:.5f}, accuracy_test={:.5f},  accuracy_best={:.5f}".format(epoch,
                                                                                        train_loss / len(train_loader),
                                                                                        accuracy_test, accuracy_best))
        train_loss = 0

        if args.lrcos:
            if args.lr_warmup:
                scheduler_warmup.step(epoch=epoch)
            else:
                cos_scheduler.step(epoch=epoch)
        if epoch < 20:
            print(epoch, optimizer.param_groups[0]['lr'])

        # Save the model and optimizer state
        if epoch % save_interval == 0:
            train_state = {
                "Epoch": epoch,
                "model_state": model.state_dict(),
                "optim_state": optimizer.state_dict(),
                "args": args
            }
            torch.save(train_state, models_dir)

        # Save the log
        with open(log_dir, 'a+', newline='') as f:
            # training results for this epoch
            my_writer = csv.writer(f)
            my_writer.writerow(log_list)
            log_list = []
        epoch = epoch + 1
    train_duration_sec = int(time.time() - start)
    print("training is end", train_duration_sec)
Example #6
def train(train_loader, model, criterion, optimizer, epoch, args, TB):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    progress = ProgressMeter(len(train_loader),
                             [batch_time, data_time, losses, top1],
                             prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    # for i, (images, target) in enumerate(train_loader):
    for i, data in enumerate(train_loader):

        images = data['image']
        name = data['name']
        target = data['label']

        target = labelshaper(target, args.multi)
        # measure data loading time
        data_time.update(time.time() - end)

        # Apply mixup here only on the CPU path; the GPU branch below applies it
        # after the tensors have been moved, so doing both would mix the batch twice.
        if args.mixup > 0 and args.gpu is None:
            left_, right_ = images
            mixed_images, labels_a, labels_b, lam = mixup_data(
                torch.cat((left_, right_), dim=1), target, args.mixup)

            left_ = mixed_images[:, 0].unsqueeze(1)
            right_ = mixed_images[:, 1].unsqueeze(1)
            images = [left_, right_]

        if args.gpu is not None:
            left_, right_ = images
            left_, right_ = left_.cuda(
                args.gpu, non_blocking=True), right_.cuda(args.gpu,
                                                          non_blocking=True)

            if args.mixup > 0:
                mixed_images, labels_a, labels_b, lam = mixup_data(
                    torch.cat((left_, right_), dim=1), target, args.mixup)

                left_ = mixed_images[:, 0].unsqueeze(1)
                right_ = mixed_images[:, 1].unsqueeze(1)

            images = [left_, right_]

        target = target.cuda(args.gpu, non_blocking=True)

        # compute output
        output = model(*images)

        if args.mixup > 0:
            loss = mixup_criterion(criterion, output, labels_a.cuda(),
                                   labels_b.cuda(), lam)
        else:
            loss = criterion(output, target)

        # measure accuracy and record loss
        acc1 = accuracy(output, target, topk=(1, ))
        losses.update(loss.item(), target.size(0))
        top1.update(acc1[0], target.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)

        # accumulate targets/outputs to compute the AUC score (one-class problem)
        if i == 0:
            targets, outputs = target, output
        else:
            targets = torch.cat((targets, target))
            outputs = torch.cat((outputs, output))

    # TensorBoard update
    TB.update('Accuracy/Train', top1.avg, epoch)
    TB.update('Loss/Train', losses.avg, epoch)
    TB.update('AUC/Train', auc(outputs, targets), epoch)
Example #7
def train_pixel_supervise(model, cost, optimizer, train_loader, test_loader,
                          args):
    '''

    :param model: model to train
    :param cost: loss function
    :param optimizer: optimizer
    :param train_loader: training data loader
    :param test_loader: test data loader
    :param args: configuration arguments
    :return:
    '''

    print(args)

    # Initialize and open timer
    start = time.time()

    if not os.path.exists(args.model_root):
        os.makedirs(args.model_root)
    if not os.path.exists(args.log_root):
        os.makedirs(args.log_root)

    models_dir = args.model_root + '/' + args.name + '.pth'
    log_dir = args.log_root + '/' + args.name + '.csv'

    # save args
    with open(log_dir, 'a+', newline='') as f:
        my_writer = csv.writer(f)
        args_dict = vars(args)
        for key, value in args_dict.items():
            my_writer.writerow([key, value])

    # Cosine learning rate decay
    if args.lrcos:
        print("lrcos is using")
        cos_scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.train_epoch, eta_min=0)

        if args.lr_warmup:
            scheduler_warmup = GradualWarmupScheduler(
                args, optimizer, multiplier=1, after_scheduler=cos_scheduler)

    # Training initialization
    epoch_num = args.train_epoch
    log_interval = args.log_interval
    save_interval = args.save_interval
    batch_num = 0
    train_loss = 0
    epoch = 0
    loss_best = 1e4
    log_list = []  # log need to save

    if args.retrain:
        if not os.path.exists(models_dir):
            print("no trained model")
        else:
            state_read = torch.load(models_dir)
            model.load_state_dict(state_read['model_state'])
            optimizer.load_state_dict(state_read['optim_state'])
            epoch = state_read['Epoch']
            print("retaining")

    # Train
    while epoch < epoch_num:
        for batch_idx, (data, target) in enumerate(
                tqdm(train_loader, desc="Epoch {}/{}".format(epoch,
                                                             epoch_num))):

            batch_num += 1

            target = torch.unsqueeze(target, dim=1)

            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()

            if args.mixup:
                mixup_alpha = args.mixup_alpha
                inputs, labels_a, labels_b, lam = mixup_data(data,
                                                             target,
                                                             alpha=mixup_alpha)
            else:
                inputs = data

            optimizer.zero_grad()

            output = model(inputs)  # forward pass on the (possibly mixed) inputs

            if args.mixup:
                loss = mixup_criterion(cost, output, labels_a, labels_b, lam)
            else:
                loss = cost(output, target)

            train_loss += loss.item()
            loss.backward()
            optimizer.step()
            # if batch_idx % log_interval == 0:  # optionally print progress; log_interval is set in args
            #     print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            #         epoch, batch_idx * len(data), len(train_loader.dataset),
            #                100. * batch_idx / len(train_loader), loss.item()))

        # Testing: no separate test pass is run here; the MSE training loss also
        # serves as the evaluation metric for depth estimation.

        if train_loss / len(train_loader) < loss_best:
            loss_best = train_loss / len(train_loader)
            save_path = os.path.join(args.model_root, args.name + '.pth')
            torch.save(model.state_dict(), save_path)
        log_list.append(train_loss / len(train_loader))

        print("Epoch {}, loss={:.5f}".format(
            epoch,
            train_loss / len(train_loader),
        ))
        train_loss = 0
        if args.lrcos:
            if args.lr_warmup:
                scheduler_warmup.step(epoch=epoch)
            else:
                cos_scheduler.step(epoch=epoch)
        if epoch < 20:
            print(epoch, optimizer.param_groups[0]['lr'])

        # save model and para
        if epoch % save_interval == 0:
            train_state = {
                "Epoch": epoch,
                "model_state": model.state_dict(),
                "optim_state": optimizer.state_dict(),
                "args": args
            }
            models_dir = args.model_root + '/' + args.name + '.pt'
            torch.save(train_state, models_dir)

        # Save the log
        with open(log_dir, 'a+', newline='') as f:
            # training results for this epoch
            my_writer = csv.writer(f)
            my_writer.writerow(log_list)
            log_list = []
        epoch = epoch + 1
    train_duration_sec = int(time.time() - start)
    print("training is end", train_duration_sec)
Example #8
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0

    conf_mat = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_a = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_b = np.zeros((NUM_CLASSES, NUM_CLASSES))

    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate**frac
        current_lr = args.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = args.lr
    print('learning_rate: %s' % str(current_lr))

    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()

        if args.augmentation:
            inputs, targets_a, targets_b, lam = mixup_data(
                inputs, targets, 0.6)
            inputs, targets_a, targets_b = map(Variable,
                                               (inputs, targets_a, targets_b))
        else:
            inputs, targets = Variable(inputs), Variable(targets)

        _, _, _, _, outputs = net(inputs)

        if args.augmentation:
            loss = mixup_criterion(criterion, outputs, targets_a, targets_b,
                                   lam)
        else:
            loss = criterion(outputs, targets)

        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.item()

        if args.augmentation:
            conf_mat_a += losses.confusion_matrix(outputs, targets_a,
                                                  NUM_CLASSES)
            acc_a = sum([conf_mat_a[i, i] for i in range(conf_mat_a.shape[0])
                         ]) / conf_mat_a.sum()
            precision_a = np.array([
                conf_mat_a[i, i] / (conf_mat_a[i].sum() + 1e-10)
                for i in range(conf_mat_a.shape[0])
            ])
            recall_a = np.array([
                conf_mat_a[i, i] / (conf_mat_a[:, i].sum() + 1e-10)
                for i in range(conf_mat_a.shape[0])
            ])
            mAP_a = sum(precision_a) / len(precision_a)
            F1_score_a = (2 * precision_a * recall_a /
                          (precision_a + recall_a + 1e-10)).mean()

            conf_mat_b += losses.confusion_matrix(outputs, targets_b,
                                                  NUM_CLASSES)
            acc_b = sum([conf_mat_b[i, i] for i in range(conf_mat_b.shape[0])
                         ]) / conf_mat_b.sum()
            precision_b = np.array([
                conf_mat_b[i, i] / (conf_mat_b[i].sum() + 1e-10)
                for i in range(conf_mat_b.shape[0])
            ])
            recall_b = np.array([
                conf_mat_b[i, i] / (conf_mat_b[:, i].sum() + 1e-10)
                for i in range(conf_mat_b.shape[0])
            ])
            mAP_b = sum(precision_b) / len(precision_b)
            F1_score_b = (2 * precision_b * recall_b /
                          (precision_b + recall_b + 1e-10)).mean()

            acc = lam * acc_a + (1 - lam) * acc_b
            mAP = lam * mAP_a + (1 - lam) * mAP_b
            F1_score = lam * F1_score_a + (1 - lam) * F1_score_b

        else:
            conf_mat += losses.confusion_matrix(outputs, targets, NUM_CLASSES)
            acc = sum([conf_mat[i, i]
                       for i in range(conf_mat.shape[0])]) / conf_mat.sum()
            precision = [
                conf_mat[i, i] / (conf_mat[i].sum() + 1e-10)
                for i in range(conf_mat.shape[0])
            ]
            mAP = sum(precision) / len(precision)

            recall = [
                conf_mat[i, i] / (conf_mat[:, i].sum() + 1e-10)
                for i in range(conf_mat.shape[0])
            ]
            precision = np.array(precision)
            recall = np.array(recall)
            f1 = 2 * precision * recall / (precision + recall + 1e-10)
            F1_score = f1.mean()

        #utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% | mAP: %.3f%% | F1: %.3f%%'
        #% (train_loss/(batch_idx+1), 100.*acc, 100.* mAP, 100.* F1_score))

    return train_loss / (batch_idx + 1), 100. * acc, 100. * mAP, 100. * F1_score
Example #9
    def train_one_epoch(self, train_loader, val_loader):
        self.model.train()
        train_loss_sum, train_acc_sum = 0.0, 0.0
        for img, label in train_loader:
            # if len(label) <= 1:
            #     continue
            img, label = img.to(DEVICE), label.to(DEVICE)
            width, height = img.size(-1), img.size(-2)
            self.optimizer.zero_grad()
            if self.mix_up:
                img, labels_a, labels_b, lam = mixup_data(img,
                                                          label,
                                                          alpha=0.2)
                output = self.model(img)
                loss = mixup_criterion(self.criteration, output, labels_a,
                                       labels_b, lam)
            elif self.cutMix:
                img, targets = cutmix(img, label)
                target_a, target_b, lam = targets
                output = self.model(img)
                loss = self.criteration(output,
                                        target_a) * lam + self.criteration(
                                            output, target_b) * (1. - lam)
            elif self.fmix:
                data, target = fmix(img,
                                    label,
                                    alpha=1.,
                                    decay_power=3.,
                                    shape=(width, height))
                targets, shuffled_targets, lam = target
                output = self.model(data)
                loss = self.criteration(
                    output, targets) * lam + self.criteration(
                        output, shuffled_targets) * (1 - lam)
            else:
                output = self.model(img)
                loss = self.criteration(output, label)
            loss.backward()
            _, preds = torch.max(output.data, 1)
            correct = (preds == label).sum().item()
            train_acc_sum += correct

            train_loss_sum += loss.item()
            self.optimizer.step()

        train_loss = train_loss_sum / len(train_loader.dataset)
        train_acc = train_acc_sum / len(train_loader.dataset)

        val_acc_sum = 0.0
        valid_loss_sum = 0
        self.model.eval()
        for val_img, val_label in val_loader:
            # if len(val_label) <= 1:
            #     continue
            val_img, val_label = val_img.to(DEVICE), val_label.to(DEVICE)
            val_output = self.model(val_img)
            _, preds = torch.max(val_output.data, 1)
            correct = (preds == val_label).sum().item()
            val_acc_sum += correct

            loss = self.criteration(val_output, val_label)
            valid_loss_sum += loss.item()

        val_acc = val_acc_sum / len(val_loader.dataset)
        val_loss = valid_loss_sum / len(val_loader.dataset)
        return train_loss, train_acc, val_loss, val_acc
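Example #9 additionally calls `cutmix` and `fmix` helpers that are not shown. Below is a sketch of a `cutmix` wrapper matching the call signature used above, with region sampling along the lines of the original CutMix reference; the `fmix` helper comes from a separate library and is omitted here.

import numpy as np
import torch


def rand_bbox(size, lam):
    # Sample a random box covering roughly (1 - lam) of the image area.
    W, H = size[3], size[2]
    cut_rat = np.sqrt(1.0 - lam)
    cut_w, cut_h = int(W * cut_rat), int(H * cut_rat)
    cx, cy = np.random.randint(W), np.random.randint(H)
    bbx1, bby1 = np.clip(cx - cut_w // 2, 0, W), np.clip(cy - cut_h // 2, 0, H)
    bbx2, bby2 = np.clip(cx + cut_w // 2, 0, W), np.clip(cy + cut_h // 2, 0, H)
    return bbx1, bby1, bbx2, bby2


def cutmix(x, y, alpha=1.0):
    lam = float(np.random.beta(alpha, alpha))
    index = torch.randperm(x.size(0), device=x.device)
    bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam)
    # Paste the sampled region from the shuffled batch into the original batch.
    x[:, :, bby1:bby2, bbx1:bbx2] = x[index, :, bby1:bby2, bbx1:bbx2]
    # Recompute lam from the area that was actually pasted.
    lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (x.size(-1) * x.size(-2)))
    # Matches the unpacking in Example #9: target_a, target_b, lam = targets
    return x, (y, y[index], lam)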
Example #10
def train(epoch):
    print('\nEpoch: %d' % epoch)
    snet.train()
    if args.model == 'VID':
        VID_NET1.train()
        VID_NET2.train()
    elif args.model == 'OFD':
        OFD_NET1.train()
        OFD_NET2.train()
    elif args.model == 'AFD':
        AFD_NET1.train()
        AFD_NET2.train()
    else:
        pass
    train_loss = 0
    train_cls_loss = 0

    conf_mat = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_a = np.zeros((NUM_CLASSES, NUM_CLASSES))
    conf_mat_b = np.zeros((NUM_CLASSES, NUM_CLASSES))

    if epoch > learning_rate_decay_start and learning_rate_decay_start >= 0:
        frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
        decay_factor = learning_rate_decay_rate**frac
        current_lr = args.lr * decay_factor
        utils.set_lr(optimizer, current_lr)  # set the decayed rate
    else:
        current_lr = args.lr
    print('learning_rate: %s' % str(current_lr))

    for batch_idx, (img_teacher, img_student,
                    target) in enumerate(trainloader):

        if args.cuda:
            img_teacher = img_teacher.cuda(non_blocking=True)
            img_student = img_student.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        optimizer.zero_grad()

        if args.augmentation:
            img_teacher, teacher_target_a, teacher_target_b, teacher_lam = mixup_data(
                img_teacher, target, 0.6)
            img_teacher, teacher_target_a, teacher_target_b = map(
                Variable, (img_teacher, teacher_target_a, teacher_target_b))

            img_student, student_target_a, student_target_b, student_lam = mixup_data(
                img_student, target, 0.6)
            img_student, student_target_a, student_target_b = map(
                Variable, (img_student, student_target_a, student_target_b))
        else:
            img_teacher, img_student, target = Variable(img_teacher), Variable(
                img_student), Variable(target)

        rb1_s, rb2_s, rb3_s, mimic_s, out_s = snet(img_student)
        rb1_t, rb2_t, rb3_t, mimic_t, out_t = tnet(img_teacher)

        if args.augmentation:
            cls_loss = mixup_criterion(Cls_crit, out_s, student_target_a,
                                       student_target_b, student_lam)
        else:
            cls_loss = Cls_crit(out_s, target)

        kd_loss = KD_T_crit(out_t, out_s)

        if args.model == 'Fitnet':
            #FITNETS: Hints for Thin Deep Nets
            if args.stage == 'Block1':
                Fitnet1_loss = other.Fitnet(rb1_t, rb1_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * Fitnet1_loss
            elif args.stage == 'Block2':
                Fitnet2_loss = other.Fitnet(rb2_t, rb2_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * Fitnet2_loss
            else:
                Fitnet1_loss = other.Fitnet(rb1_t, rb1_s).cuda()
                Fitnet2_loss = other.Fitnet(rb2_t, rb2_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * Fitnet1_loss + args.delta * Fitnet2_loss

        elif args.model == 'AT':  # An activation-based attention transfer with the sum of absolute values raised to the power of 2.
            #Paying More Attention to Attention: Improving the Performance of Convolutional Neural Networks via Attention Transfer
            if args.stage == 'Block1':
                AT1_loss = other.AT(rb1_t, rb1_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * AT1_loss
            elif args.stage == 'Block2':
                AT2_loss = other.AT(rb2_t, rb2_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * AT2_loss
            else:
                AT1_loss = other.AT(rb1_t, rb1_s).cuda()
                AT2_loss = other.AT(rb2_t, rb2_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * AT1_loss + args.delta * AT2_loss

        elif args.model == 'NST':  # NST (poly)
            #Like What You Like: Knowledge Distill via Neuron Selectivity Transfer
            if args.stage == 'Block1':
                NST1_loss = other.NST(rb1_t, rb1_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * NST1_loss
            elif args.stage == 'Block2':
                NST2_loss = other.NST(rb2_t, rb2_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * NST2_loss
            else:
                NST1_loss = other.NST(rb1_t, rb1_s).cuda()
                NST2_loss = other.NST(rb2_t, rb2_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * NST1_loss + args.delta * NST2_loss

        elif args.model == 'PKT':  # PKT
            #Learning Deep Representations with Probabilistic Knowledge Transfer
            if args.stage == 'Block1':
                PKT1_loss = other.PKT(rb1_t, rb1_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * PKT1_loss
            elif args.stage == 'Block2':
                PKT2_loss = other.PKT(rb2_t, rb2_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * PKT2_loss
            else:
                PKT1_loss = other.PKT(rb1_t, rb1_s).cuda()
                PKT2_loss = other.PKT(rb2_t, rb2_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * PKT1_loss + args.delta * PKT2_loss

        elif args.model == 'AB':  # AB
            #Knowledge Transfer via Distillation of Activation Boundaries Formed by Hidden Neurons
            if args.stage == 'Block1':
                AB1_loss = other.AB(rb1_t, rb1_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * AB1_loss
            elif args.stage == 'Block2':
                AB2_loss = other.AB(rb2_t, rb2_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * AB2_loss
            else:
                AB1_loss = other.AB(rb1_t, rb1_s).cuda()
                AB2_loss = other.AB(rb2_t, rb2_s).cuda()
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * AB1_loss + args.delta * AB2_loss

        elif args.model == 'CCKD':  #
            #Correlation Congruence for Knowledge Distillation
            if args.stage == 'Block1':
                CCKD1_loss = other.CCKD().cuda()(rb1_t, rb1_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * CCKD1_loss
            elif args.stage == 'Block2':
                CCKD2_loss = other.CCKD().cuda()(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * CCKD2_loss
            else:
                CCKD1_loss = other.CCKD().cuda()(rb1_t, rb1_s)
                CCKD2_loss = other.CCKD().cuda()(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * CCKD1_loss + args.delta * CCKD2_loss

        elif args.model == 'RKD':  # RKD-DA
            #Relational Knowledge Disitllation
            if args.stage == 'Block1':
                RKD1_loss = other.RKD().cuda()(rb1_t, rb1_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * RKD1_loss
            elif args.stage == 'Block2':
                RKD2_loss = other.RKD().cuda()(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * RKD2_loss
            else:
                RKD1_loss = other.RKD().cuda()(rb1_t, rb1_s)
                RKD2_loss = other.RKD().cuda()(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * RKD1_loss + args.delta * RKD2_loss

        elif args.model == 'SP':  # SP
            #Similarity-Preserving Knowledge Distillation
            if args.stage == 'Block1':
                SP1_loss = other.SP().cuda()(rb1_t, rb1_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * SP1_loss
            elif args.stage == 'Block2':
                SP2_loss = other.SP().cuda()(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * SP2_loss
            else:
                SP1_loss = other.SP().cuda()(rb1_t, rb1_s)
                SP2_loss = other.SP().cuda()(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * SP1_loss + args.delta * SP2_loss

        elif args.model == 'VID':  # VID-I
            #Variational Information Distillation for Knowledge Transfer
            if args.stage == 'Block1':
                VID1_loss = VID_NET1(rb1_t, rb1_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * VID1_loss
            elif args.stage == 'Block2':
                VID2_loss = VID_NET2(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * VID2_loss
            else:
                VID1_loss = VID_NET1(rb1_t, rb1_s)
                VID2_loss = VID_NET2(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * VID1_loss + args.delta * VID2_loss

        elif args.model == 'OFD':  # OFD
            #A Comprehensive Overhaul of Feature Distillation
            if args.stage == 'Block1':
                OFD1_loss = OFD_NET1(rb1_t, rb1_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * OFD1_loss
            elif args.stage == 'Block2':
                OFD2_loss = OFD_NET2(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * OFD2_loss
            else:
                OFD1_loss = OFD_NET1(rb1_t, rb1_s)
                OFD2_loss = OFD_NET2(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * OFD1_loss + args.delta * OFD2_loss

        elif args.model == 'AFDS':  #
            #Pay Attention to Features, Transfer Learn Faster CNNs
            if args.stage == 'Block1':
                AFD1_loss = AFD_NET1(rb1_t, rb1_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * AFD1_loss
            elif args.stage == 'Block2':
                AFD2_loss = AFD_NET2(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * AFD2_loss
            else:
                AFD1_loss = AFD_NET1(rb1_t, rb1_s)
                AFD2_loss = AFD_NET2(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * AFD1_loss + args.delta * AFD2_loss

        elif args.model == 'FT':  #
            #Paraphrasing Complex Network: Network Compression via Factor Transfer
            if args.stage == 'Block1':
                FT1_loss = other.FT().cuda()(rb1_t, rb1_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * FT1_loss
            elif args.stage == 'Block2':
                FT2_loss = other.FT().cuda()(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.delta * FT2_loss
            else:
                FT1_loss = other.FT().cuda()(rb1_t, rb1_s)
                FT2_loss = other.FT().cuda()(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * FT1_loss + args.delta * FT2_loss

        elif args.model == 'CD':  # CD+GKD+CE
            #Channel Distillation: Channel-Wise Attention for Knowledge Distillation
            if args.stage == 'Block1':
                kd_loss_v2 = other.KDLossv2(args.T).cuda()(out_t, out_s,
                                                           target)
                CD1_loss = other.CD().cuda()(rb1_t, rb1_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss_v2 + args.gamma * CD1_loss
            elif args.stage == 'Block2':
                kd_loss_v2 = other.KDLossv2(args.T).cuda()(out_t, out_s,
                                                           target)
                CD2_loss = other.CD().cuda()(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss_v2 + args.delta * CD2_loss
            else:
                kd_loss_v2 = other.KDLossv2(args.T).cuda()(out_t, out_s,
                                                           target)
                CD1_loss = other.CD().cuda()(rb1_t, rb1_s)
                CD2_loss = other.CD().cuda()(rb2_t, rb2_s)
                loss = args.alpha * cls_loss + args.beta * kd_loss_v2 + args.gamma * CD1_loss + args.delta * CD2_loss

        elif args.model == 'FAKD':  # DS+TS+SA
            #FAKD: Feature-Affinity Based Knowledge Distillation for Efficient Image Super-Resolution
            if args.stage == 'Block1':
                FAKD_DT_loss = other.FAKD_DT().cuda()(out_t, out_s, target,
                                                      NUM_CLASSES)
                FAKD_SA1_loss = other.FAKD_SA().cuda()(rb1_t, rb1_s)
                loss = args.alpha * FAKD_DT_loss + args.gamma * FAKD_SA1_loss  # No T
            elif args.stage == 'Block2':
                FAKD_DT_loss = other.FAKD_DT().cuda()(out_t, out_s, target,
                                                      NUM_CLASSES)
                FAKD_SA2_loss = other.FAKD_SA().cuda()(rb2_t, rb2_s)
                loss = args.alpha * FAKD_DT_loss + args.gamma * FAKD_SA2_loss
            else:
                FAKD_DT_loss = other.FAKD_DT().cuda()(out_t, out_s, target,
                                                      NUM_CLASSES)
                FAKD_SA1_loss = other.FAKD_SA().cuda()(rb1_t, rb1_s)
                FAKD_SA2_loss = other.FAKD_SA().cuda()(rb2_t, rb2_s)
                loss = args.alpha * FAKD_DT_loss + args.gamma * FAKD_SA1_loss + args.delta * FAKD_SA2_loss

        elif args.model == 'VKD':  #
            #Robust Re-Identification by Multiple Views Knowledge Distillation
            if args.stage == 'Block1':
                VKD_Similarity1_loss = other.VKD_SimilarityDistillationLoss(
                ).cuda()(rb1_t, rb1_s)
                VKD_OnlineTriplet1_loss = other.VKD_OnlineTripletLoss().cuda()(
                    rb1_s, target)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * VKD_Similarity1_loss \
                                             + args.delta * VKD_OnlineTriplet1_loss
            elif args.stage == 'Block2':
                VKD_Similarity2_loss = other.VKD_SimilarityDistillationLoss(
                ).cuda()(rb2_t, rb2_s)
                VKD_OnlineTriplet2_loss = other.VKD_OnlineTripletLoss().cuda()(
                    rb2_s, target)
                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * VKD_Similarity2_loss \
                                             + args.delta * VKD_OnlineTriplet2_loss
            else:
                VKD_Similarity1_loss = other.VKD_SimilarityDistillationLoss(
                ).cuda()(rb1_t, rb1_s)
                VKD_OnlineTriplet1_loss = other.VKD_OnlineTripletLoss().cuda()(
                    rb1_s, target)

                VKD_Similarity2_loss = other.VKD_SimilarityDistillationLoss(
                ).cuda()(rb2_t, rb2_s)
                VKD_OnlineTriplet2_loss = other.VKD_OnlineTripletLoss().cuda()(
                    rb2_s, target)

                loss = args.alpha * cls_loss + args.beta * kd_loss + args.gamma * VKD_Similarity1_loss \
                           + args.delta * VKD_OnlineTriplet1_loss  + args.gamma * VKD_Similarity2_loss \
                                             + args.delta * VKD_OnlineTriplet2_loss

        elif args.model == 'RAD':  # RAD:  Resolution-Adapted Distillation
            # Efficient Low-Resolution Face Recognition via Bridge Distillation
            distance = mimic_t - mimic_s
            RAD_loss = torch.pow(distance, 2).sum(dim=(0, 1), keepdim=False)
            loss = RAD_loss + cls_loss
        else:
            raise Exception('Invalid model name...')

        loss.backward()
        utils.clip_gradient(optimizer, 0.1)
        optimizer.step()
        train_loss += loss.item()
        train_cls_loss += cls_loss.item()

        if args.augmentation:
            conf_mat_a += losses.confusion_matrix(out_s, student_target_a,
                                                  NUM_CLASSES)
            acc_a = sum([conf_mat_a[i, i] for i in range(conf_mat_a.shape[0])
                         ]) / conf_mat_a.sum()
            precision_a = np.array([
                conf_mat_a[i, i] / (conf_mat_a[i].sum() + 1e-10)
                for i in range(conf_mat_a.shape[0])
            ])
            recall_a = np.array([
                conf_mat_a[i, i] / (conf_mat_a[:, i].sum() + 1e-10)
                for i in range(conf_mat_a.shape[0])
            ])
            mAP_a = sum(precision_a) / len(precision_a)
            F1_score_a = (2 * precision_a * recall_a /
                          (precision_a + recall_a + 1e-10)).mean()

            conf_mat_b += losses.confusion_matrix(out_s, student_target_b,
                                                  NUM_CLASSES)
            acc_b = sum([conf_mat_b[i, i] for i in range(conf_mat_b.shape[0])
                         ]) / conf_mat_b.sum()
            precision_b = np.array([
                conf_mat_b[i, i] / (conf_mat_b[i].sum() + 1e-10)
                for i in range(conf_mat_b.shape[0])
            ])
            recall_b = np.array([
                conf_mat_b[i, i] / (conf_mat_b[:, i].sum() + 1e-10)
                for i in range(conf_mat_b.shape[0])
            ])
            mAP_b = sum(precision_b) / len(precision_b)
            F1_score_b = (2 * precision_b * recall_b /
                          (precision_b + recall_b + 1e-10)).mean()

            acc = student_lam * acc_a + (1 - student_lam) * acc_b
            mAP = student_lam * mAP_a + (1 - student_lam) * mAP_b
            F1_score = student_lam * F1_score_a + (1 -
                                                   student_lam) * F1_score_b

        else:
            conf_mat += losses.confusion_matrix(out_s, target, NUM_CLASSES)
            acc = sum([conf_mat[i, i]
                       for i in range(conf_mat.shape[0])]) / conf_mat.sum()
            precision = [
                conf_mat[i, i] / (conf_mat[i].sum() + 1e-10)
                for i in range(conf_mat.shape[0])
            ]
            mAP = sum(precision) / len(precision)

            recall = [
                conf_mat[i, i] / (conf_mat[:, i].sum() + 1e-10)
                for i in range(conf_mat.shape[0])
            ]
            precision = np.array(precision)
            recall = np.array(recall)
            f1 = 2 * precision * recall / (precision + recall + 1e-10)
            F1_score = f1.mean()

        #utils.progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% | mAP: %.3f%% | F1: %.3f%%'
        #% (train_loss/(batch_idx+1), 100.*acc, 100.* mAP, 100.* F1_score))

    return train_cls_loss / (batch_idx + 1), 100. * acc, 100. * mAP, 100. * F1_score