Exemple #1
0
    def training_obj(self, train, train_target, weights, model_opt, val,
                     val_target, global_step):
        """Build the training objective for one step.

        Without ``self.gen_error_alpha`` the objective is the criterion on the
        training batch.  With it, the absolute gap between the validation and
        training losses, scaled by ``self.gen_error_alpha_lambda``, is added
        and the share of ``loss_val > loss_train`` entries is pushed into
        ``self.loss_diff_sign``.  If ``self.alpha_loss`` is set, the term
        ``self.args.alpha_loss_lambda * self.alpha.module.alpha_loss(weights)``
        is added on top and logged every ``self.report_freq`` counts.

        Args:
            train: training batch passed to ``self.model``.
            train_target: targets for ``train``.
            weights: forwarded to ``self.model`` and to ``alpha_loss``.
            model_opt: not used by this method; kept for the caller's signature.
            val: validation batch, read only when ``gen_error_alpha`` is set.
            val_target: targets for ``val``.
            global_step: step index used for the ``meta/alpha_loss`` scalar.

        Returns:
            Tuple ``(loss, accuracy, loss1, loss2)``: the full objective, the
            first entry returned by ``utils.accuracy`` on the training batch,
            the training loss alone, and the loss-gap term (zeros when
            ``gen_error_alpha`` is off).
        """
        if not self.gen_error_alpha:
            logits = self.model(train, weights)
            loss = self.criterion(logits, train_target)
            accuracy = utils.accuracy(logits, train_target)[0]
            loss1, loss2 = loss, torch.zeros_like(loss)
        else:
            logits_train = self.model(train, weights)
            loss_train = self.criterion(logits_train, train_target)

            logits_val = self.model(val, weights)
            loss_val = self.criterion(logits_val, val_target)

            loss2 = torch.abs(loss_val - loss_train)
            self.loss_diff_sign.update(
                torch.mean(((loss_val - loss_train) > 0).float()).data)
            loss1 = loss_train
            loss = loss1 + self.gen_error_alpha_lambda * loss2
            accuracy = utils.accuracy(logits_train, train_target)[0]

        if self.alpha_loss:
            alpha_loss = self.alpha.module.alpha_loss(weights)
            # Out-of-place add: in the first branch ``loss1`` is the same
            # tensor object as ``loss``, so ``loss += ...`` would also change
            # the ``loss1`` value handed back to the caller.
            loss = loss + self.args.alpha_loss_lambda * alpha_loss

            if self.count % self.report_freq == 0:
                self.writer.add_scalar('meta/alpha_loss',
                                       torch.mean(alpha_loss), global_step)

        return loss, accuracy, loss1, loss2
    def evaluate(self, cfg):
        """Score the network on ``self.val_loader`` and return the mean accuracy.

        Sets ``self.phase`` to ``'test'``, puts ``self.net`` in eval mode,
        resets the per-run buffers, then feeds every validation batch through
        ``_forward`` / ``_process_fc`` with gradients disabled while updating
        the ``VAL_CLS_ACC`` meter.

        Args:
            cfg: configuration forwarded to ``self._cal_mean_acc``; batch
                loading itself reads ``self.cfg.DATA_TYPE``.

        Returns:
            The value produced by ``self._cal_mean_acc`` for the validation
            loader.
        """
        self.phase = 'test'
        self.net.eval()  # evaluation mode

        # Fresh buffers for this evaluation run.
        self.imgs_all, self.pred_index_all, self.target_index_all = [], [], []
        self.fake_image_num = 0

        with torch.no_grad():
            print('# Cls val images num = {0}'.format(self.val_image_num))

            for batch in self.val_loader:
                self.set_input(batch, self.cfg.DATA_TYPE)
                self._forward()
                self._process_fc()

                # top-1 accuracy of the current batch
                top1 = util.accuracy(self.cls.data, self.label, topk=(1, ))[0]
                self.loss_meters['VAL_CLS_ACC'].update(top1.item(),
                                                       self.batch_size)

        result = self._cal_mean_acc(cfg=cfg, data_loader=self.val_loader)
        print('mean_acc: [{0}]'.format(result))
        return result
Exemple #3
0
def test():
    """Evaluate the module-level ``model`` on the test split and print results.

    Reads the globals ``model``, ``features``, ``adj``, ``labels`` and
    ``idx_test``.  Both the MSE loss and the accuracy are computed on the
    ``idx_test`` rows only, so the two printed numbers describe the same
    subset.
    """
    model.eval()
    output = model(features, adj)
    loss_test = F.mse_loss(output[idx_test], labels[idx_test])
    acc_test = accuracy(output[idx_test], labels[idx_test])
    # ``.item()`` replaces ``.data[0]``, which raises on 0-dim tensors in
    # PyTorch >= 0.4.
    print("Test set results:", "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test.item()))
Exemple #4
0
def test(test_feature,test_label):
    """Run the module-level ``model`` on the given features and print metrics.

    Prints the predicted class indices, the labels, then the NLL loss and the
    accuracy.  Reads the globals ``model`` and ``adj``.

    Args:
        test_feature: feature input passed to ``model`` together with ``adj``.
        test_label: target labels compared against the model output.
    """
    model.eval()
    output = model(test_feature, adj)
    print(output.max(1)[1].data)
    print(test_label.data)
    acc_test = accuracy(output, test_label)
    loss_test = F.nll_loss(output, test_label)
    # ``.item()`` replaces ``.data[0]``, which raises on 0-dim tensors in
    # PyTorch >= 0.4.
    print("Test set results: loss={:.4f} test acc={:.4f}".format(
        loss_test.item(), acc_test.item()))
    def test_step(self, batch, batch_idx):
        """Compute loss and accuracy for one test batch.

        Args:
            batch: pair ``(inp_img, target)``.
            batch_idx: index of the batch; not used here.

        Returns:
            Dict with ``'test_loss'`` and ``'test_accuracy'``.
        """
        inp_img, target = batch
        preds = self.forward(inp_img)
        loss = self.criterrion(preds, target)
        # Take the first entry of the accuracy result, as validation_step
        # does, so both steps report the same kind of value.
        # NOTE(review): assumes accuracy() returns an indexable result.
        accu = accuracy(preds, target)[0]
        results = {'test_loss': loss,
                   'test_accuracy': accu}

        return results
    def validation_step(self, batch, batch_idx):
        """Compute loss and accuracy for one validation batch.

        Args:
            batch: pair ``(images, labels)``.
            batch_idx: index of the batch; not used here.

        Returns:
            Dict with ``'val_loss'`` and ``'val_accuracy'`` (the first entry
            of the ``accuracy`` result).
        """
        images, labels = batch
        outputs = self.forward(images)
        return {
            'val_loss': self.criterrion(outputs, labels),
            'val_accuracy': accuracy(outputs, labels)[0],
        }
Exemple #7
0
    def _construct_TRAIN_G_LOSS(self, epoch=None):

        loss_total = torch.zeros(1)
        if self.use_gpu:
            loss_total = loss_total.cuda()

        if self.gen is not None:
            assert (self.gen.size(-1) == self.cfg.FINE_SIZE)

        if 'CLS' in self.cfg.LOSS_TYPES:
            cls_loss = self.loss['cls_loss'].mean()
            loss_total = loss_total + cls_loss

            cls_loss = round(cls_loss.item(), 4)
            self.loss_meters['TRAIN_CLS_LOSS'].update(cls_loss,
                                                      self.batch_size)

            prec1 = util.accuracy(self.cls.data, self.label, topk=(1, ))
            self.loss_meters['TRAIN_CLS_ACC'].update(prec1[0].item(),
                                                     self.batch_size)

        # ) content supervised
        if self.cfg.NITER_START_CONTENT <= epoch <= self.cfg.NITER_END_CONTENT:

            if 'SEMANTIC' in self.cfg.LOSS_TYPES:
                content_loss = self.loss['content_loss'].mean()
                loss_total = loss_total + content_loss

                content_loss = round(content_loss.item(), 4)
                self.loss_meters['TRAIN_SEMANTIC_LOSS'].update(
                    content_loss, self.batch_size)

        if self.cfg.NITER_START_PIX2PIX <= epoch <= self.cfg.NITER_END_PIX2PIX:

            if 'PIX2PIX' in self.cfg.LOSS_TYPES:
                pix2pix_loss = self.loss['pix2pix_loss'].mean()
                loss_total = loss_total + pix2pix_loss

                pix2pix_loss = round(pix2pix_loss.item(), 4)
                self.loss_meters['TRAIN_PIXEL_LOSS'].update(
                    pix2pix_loss, self.batch_size)

        if self.cfg.NITER_START_GAN <= epoch <= self.cfg.NITER_END_GAN:

            if 'GAN' in self.cfg.LOSS_TYPES:
                self.forward_D(detach=False)

                loss_GAN = self.criterion_GAN(self.pred_fake,
                                              self._real) * self.cfg.ALPHA_GAN
                loss_total += loss_GAN

                loss_G_GAN = round(loss_GAN.item(), 4)
                self.loss_meters['TRAIN_G_LOSS'].update(
                    loss_G_GAN, self.batch_size)

        # total loss
        return loss_total
Exemple #8
0
def train(iteration):
    """Run one optimisation step and, every tenth call, report validation.

    Reads the globals ``model``, ``optimizer``, ``features``, ``adj``,
    ``labels`` and ``fastmode``.  Unless ``fastmode`` is set, the model is
    switched to eval mode and run a second time so the validation numbers
    come from a forward pass without train-time behaviour.

    Args:
        iteration: zero-based iteration index; validation metrics are
            computed and printed when ``(iteration + 1) % 10 == 0``.

    Returns:
        Tuple ``(train_loss, train_acc, val_losses)`` where the first two are
        Python scalars and ``val_losses`` is a list holding the validation
        loss when it was computed, otherwise empty.
    """
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    loss_train = F.nll_loss(output, labels)
    acc_train = accuracy(output, labels)
    loss_train.backward()
    optimizer.step()

    if not fastmode:
        model.eval()
        output = model(features, adj)

    los_val_ = []
    if (iteration + 1) % 10 == 0:
        loss_val = F.nll_loss(output, labels)
        acc_val = accuracy(output, labels)
        # ``.item()`` replaces ``.data[0]``, which raises on 0-dim tensors in
        # PyTorch >= 0.4.
        los_val_.append(loss_val.item())
        # Report the function's own argument; the original referenced an
        # ``epoch`` name that is not defined in this function.
        print('Epoch:{:04d} Val loss:{:.4f} Val acc:{:.4f}'.format(
            iteration + 1, loss_val.item(), acc_val.item()))

    return loss_train.item(), acc_train.item(), los_val_
Exemple #9
0
    def _cal_loss(self, epoch=None):

        loss_total = torch.zeros(1)
        if self.use_gpu:
            loss_total = loss_total.cuda()

        if self.gen is not None:
            assert (self.gen.size(-1) == self.cfg.FINE_SIZE)

        if 'CLS' in self.cfg.LOSS_TYPES:
            cls_loss = self.criterion_cls(self.cls,
                                          self.label) * self.cfg.ALPHA_CLS
            loss_total = loss_total + cls_loss

            cls_loss = round(cls_loss.item(), 4)
            self.loss_meters['TRAIN_CLS_LOSS'].update(cls_loss,
                                                      self.batch_size)

            prec1 = util.accuracy(self.cls.data, self.label, topk=(1, ))
            self.loss_meters['TRAIN_CLS_ACC'].update(prec1[0].item(),
                                                     self.batch_size)

        # ) content supervised
        if self.cfg.NITER_START_CONTENT <= epoch <= self.cfg.NITER_END_CONTENT:

            if 'SEMANTIC' in self.cfg.LOSS_TYPES:
                source_features = self.content_model(
                    (self.gen + 1) / 2, layers=self.content_layers)
                target_features = self.content_model(
                    (self.target_modal + 1) / 2, layers=self.content_layers)
                len_layers = len(self.content_layers)
                loss_fns = [self.criterion_content] * len_layers
                alpha = [1] * len_layers

                layer_wise_losses = [
                    alpha[i] * loss_fns[i](source_feature, target_features[i])
                    for i, source_feature in enumerate(source_features)
                ] * self.cfg.ALPHA_CONTENT

                content_loss = sum(layer_wise_losses)
                loss_total = loss_total + content_loss

                self.loss_meters['TRAIN_SEMANTIC_LOSS'].update(
                    content_loss.item(), self.batch_size)

        # total loss
        return loss_total
    def _cal_loss(self, epoch=None):
        """Return the weighted classification loss and update the train meters.

        Args:
            epoch: accepted for interface compatibility; not read here.

        Returns:
            Tensor of shape ``(1,)`` equal to ``criterion_cls(cls, label) *
            cfg.ALPHA_CLS`` (on the GPU when ``self.use_gpu`` is set).
        """
        zero = torch.zeros(1)
        total = zero.cuda() if self.use_gpu else zero

        weighted = self.criterion_cls(self.cls, self.label) * self.cfg.ALPHA_CLS
        total = total + weighted

        # The loss meter stores the scalar rounded to four decimals.
        self.loss_meters['TRAIN_CLS_LOSS'].update(round(weighted.item(), 4),
                                                  self.batch_size)

        # top-1 accuracy of the current batch
        top1 = util.accuracy(self.cls.data, self.label, topk=(1,))[0]
        self.loss_meters['TRAIN_CLS_ACC'].update(top1.item(), self.batch_size)

        return total
Exemple #11
0
def train_epoch(train_loader, net, criterion, optimizer, scheduler, cur_epoch,
                rank):
    """Train ``net`` for one epoch and report progress on rank 0.

    Sets the epoch learning rate through ``utils``, reseeds the loader's
    sampler for ``cur_epoch``, then for every batch runs forward, backward,
    an optimizer step and a scheduler step.  Loss and top-1/top-5 accuracy
    are passed through ``utils.scaled_all_reduce`` before being recorded.

    Args:
        train_loader: loader whose ``sampler`` supports ``set_epoch``.
        net: model to train.
        criterion: loss function applied to ``(outputs, targets)``.
        optimizer: optimizer stepped once per batch.
        scheduler: scheduler stepped once per batch.
        cur_epoch: epoch index used for the LR lookup, sampler and prefix.
        rank: process rank; only rank 0 prints progress.
    """
    batch_time, data_time, losses, top1, topk = utils.get_meters()
    num_batches = len(train_loader)
    progress = utils.ProgressMeter(
        num_batches,
        [batch_time, data_time, losses, top1, topk],
        prefix=" = TRAIN:     [{}]".format(cur_epoch),
    )

    utils.set_lr(optimizer, utils.get_epoch_lr(cur_epoch))

    train_loader.sampler.set_epoch(cur_epoch)  # reseed the sampler
    net.train()  # training mode

    tick = time.time()
    for step, (images, labels) in enumerate(train_loader):
        data_time.update(time.time() - tick)

        images = images.cuda()
        labels = labels.cuda(non_blocking=True)
        logits = net(images)
        batch_loss = criterion(logits, labels)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        n_samples = images.size(0)
        acc_1, acc_k = utils.accuracy(logits, labels, topk=(1, 5))
        reduced = utils.scaled_all_reduce([batch_loss, acc_1, acc_k])
        batch_loss, acc_1, acc_k = reduced

        losses.update(batch_loss.item(), n_samples)
        top1.update(acc_1[0], n_samples)
        topk.update(acc_k[0], n_samples)

        batch_time.update(time.time() - tick)
        tick = time.time()

        # Print on the configured interval and always on the final batch.
        if rank == 0 and (step % cfg.TRAIN.PRINT_FEQ == 0
                          or step + 1 == num_batches):
            progress.display(step)
def validate(model, criterion, device, valid_loader):
    """Evaluate ``model`` on ``valid_loader`` without gradients.

    Loss and accuracy are averaged over batches (each batch counts equally).

    Args:
        model: module to evaluate; switched to eval mode.
        criterion: loss function applied to ``(predicts, targets)``.
        device: device the batches are moved to.
        valid_loader: iterable of ``(images, targets)`` batches.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``.  Both are ``nan`` when the
        loader yields no batches, because the averages are undefined then.
    """
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    steps = 0
    with torch.no_grad():
        for images_batch, targets_batch in valid_loader:
            images_batch = images_batch.to(device)
            targets_batch = targets_batch.to(device)
            predicts = model(images_batch)
            loss = criterion(predicts, targets_batch)
            epoch_loss += loss.item()
            epoch_acc += accuracy(predicts, targets_batch)[0].item()
            steps += 1

    if steps == 0:
        # An empty loader previously caused ZeroDivisionError.
        return float('nan'), float('nan')

    return epoch_loss / steps, epoch_acc / steps
def train(model, optimizer, criterion, device, train_loader, valid_loader,
          args, information, checkpoints_path):
    """Train ``model``, validating, logging and checkpointing every epoch.

    Args:
        model: module to train; moved to ``device``.
        optimizer: optimizer stepped once per batch.
        criterion: loss function applied to ``(predicts, targets)``.
        device: device for the model and the batches.
        train_loader: iterable of ``(images, targets)`` training batches.
        valid_loader: loader passed to ``validate`` after each epoch.
        args: object providing ``start_epoch`` and ``epochs``.
        information: pair ``(table, name)``; one row of
            ``[train_loss, train_acc, val_loss, val_acc]`` is written to
            ``table.loc[...]`` per epoch and the table is saved to
            ``info/<name>.csv``.
        checkpoints_path: directory receiving ``epoch_<n>.pth`` files with
            the model and optimizer state dicts.
    """
    model.to(device)
    print('Train started...')
    # ``row`` is the zero-based position of this epoch's entry in the table.
    for row, epoch in enumerate(tqdm(range(args.start_epoch, args.epochs))):
        print(f'Epoch {epoch+1} started')
        model.train()
        epoch_loss = 0
        epoch_acc = 0
        steps = 0
        for images_batch, targets_batch in train_loader:
            images_batch = images_batch.to(device)
            targets_batch = targets_batch.to(device)
            predicts = model(images_batch)
            loss = criterion(predicts, targets_batch)
            epoch_loss += loss.item()
            epoch_acc += accuracy(predicts, targets_batch)[0].item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            steps += 1

        if steps:
            epoch_loss /= steps
            epoch_acc /= steps
        else:
            # An empty loader previously caused ZeroDivisionError; the
            # averages are undefined, so record NaN.
            epoch_loss = epoch_acc = float('nan')

        val_loss, val_accuracy = validate(model, criterion, device,
                                          valid_loader)
        epoch_info = f'Epoch finished! Train loss: {epoch_loss}, ' \
                     f'Train acc: {epoch_acc}, Val loss: {val_loss}, Val acc: {val_accuracy}'
        print(epoch_info)

        information[0].loc[row] = [
            epoch_loss, epoch_acc, val_loss, val_accuracy
        ]
        information[0].to_csv(f'info/{information[1]}.csv', index=False)
        torch.save(
            {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }, os.path.join(checkpoints_path, f'epoch_{epoch + 1}.pth'))
Exemple #14
0
    def evaluate(self, cfg, epoch=None):
        """Score the network on ``self.val_loader`` and return the mean accuracy.

        Sets ``self.phase`` to ``'test'``, puts ``self.net`` in eval mode and
        resets the per-run buffers.  Every validation batch is run through
        ``_forward`` / ``_process_fc`` with gradients disabled; the
        ``VAL_CLS_ACC`` meter is always updated and ``VAL_CLS_LOSS`` is
        updated when ``cfg.INFERENCE`` is falsy and a classification loss is
        available.

        Args:
            cfg: configuration providing ``INFERENCE``; also forwarded to
                ``self._cal_mean_acc``.
            epoch: accepted for interface compatibility; not read here.

        Returns:
            The value produced by ``self._cal_mean_acc`` for the validation
            loader.
        """
        self.phase = 'test'
        self.net.eval()  # evaluation mode

        # Fresh buffers for this evaluation run.
        self.imgs_all, self.pred_index_all, self.target_index_all = [], [], []

        with torch.no_grad():
            print('# Cls val images num = {0}'.format(self.val_image_num))

            for batch in self.val_loader:
                self.set_input(batch, self.cfg.DATA_TYPE)
                self._forward()
                self._process_fc()

                # Loss is only tracked outside inference mode.
                if not cfg.INFERENCE and self.loss['cls_loss'] is not None:
                    mean_cls = self.loss['cls_loss'].mean()
                    self.loss_meters['VAL_CLS_LOSS'].update(
                        round(mean_cls.item(), 4), self.batch_size)

                # top-1 accuracy of the current batch
                top1 = util.accuracy(self.cls.data, self.label, topk=(1, ))[0]
                self.loss_meters['VAL_CLS_ACC'].update(top1.item(),
                                                       self.batch_size)

        result = self._cal_mean_acc(cfg=cfg, data_loader=self.val_loader)
        print('mean_acc:', result)
        return result
def test(model, criterion, device, test_loader, args, information,
         checkpoints_path):
    """Evaluate ``model`` on ``test_loader`` and store the averaged metrics.

    Loss and accuracy are averaged over batches, printed, written to row 0 of
    ``information[0]`` and saved to ``test_info/<information[1]>.csv``.

    Args:
        model: module to evaluate; switched to eval mode.
        criterion: loss function applied to ``(predicts, targets)``.
        device: device the batches are moved to.
        test_loader: iterable of ``(images, targets)`` batches.
        args: not used by this function.
        information: pair ``(table, name)`` as described above.
        checkpoints_path: not used by this function.
    """
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    steps = 0
    with torch.no_grad():
        for images_batch, targets_batch in tqdm(test_loader):
            images_batch = images_batch.to(device)
            targets_batch = targets_batch.to(device)
            predicts = model(images_batch)
            loss = criterion(predicts, targets_batch)
            epoch_loss += loss.item()
            epoch_acc += accuracy(predicts, targets_batch)[0].item()
            steps += 1

    if steps:
        epoch_loss /= steps
        epoch_acc /= steps
    else:
        # An empty loader previously caused ZeroDivisionError; the averages
        # are undefined, so record NaN.
        epoch_loss = epoch_acc = float('nan')

    print(f'Test finished! Test loss: {epoch_loss}, Test acc: {epoch_acc}')
    information[0].loc[0] = [epoch_loss, epoch_acc]
    information[0].to_csv(f'test_info/{information[1]}.csv', index=False)
Exemple #16
0
def validate(val_loader, net, criterion, cur_epoch, rank):
    """Evaluate ``net`` on ``val_loader`` and return average accuracies.

    Runs every batch without gradients, passes loss and top-1/top-5 accuracy
    through ``utils.scaled_all_reduce`` and records them in the meters from
    ``utils.get_meters``.  Progress is printed on rank 0 only.

    Args:
        val_loader: loader yielding ``(inputs, targets)`` batches.
        net: model to evaluate; switched to eval mode.
        criterion: loss function applied to ``(outputs, targets)``.
        cur_epoch: epoch index shown in the progress prefix.
        rank: process rank; only rank 0 prints progress.

    Returns:
        Tuple ``(top1.avg, topk.avg)`` taken from the accuracy meters.
    """
    batch_time, data_time, losses, top1, topk = utils.get_meters()
    num_batches = len(val_loader)
    progress = utils.ProgressMeter(
        num_batches,
        [batch_time, data_time, losses, top1, topk],
        prefix=" = VAL:     [{}]".format(cur_epoch),
    )

    net.eval()  # evaluation mode
    with torch.no_grad():
        tick = time.time()
        for step, (images, labels) in enumerate(val_loader):
            data_time.update(time.time() - tick)

            images = images.cuda()
            labels = labels.cuda(non_blocking=True)
            logits = net(images)
            batch_loss = criterion(logits, labels)

            acc_1, acc_k = utils.accuracy(logits, labels, topk=(1, 5))
            reduced = utils.scaled_all_reduce([batch_loss, acc_1, acc_k])
            batch_loss, acc_1, acc_k = reduced

            n_samples = images.size(0)
            losses.update(batch_loss.item(), n_samples)
            top1.update(acc_1[0], n_samples)
            topk.update(acc_k[0], n_samples)
            batch_time.update(time.time() - tick)
            tick = time.time()

            # Print on the configured interval and always on the final batch.
            if rank == 0 and (step % cfg.TEST.PRINT_FEQ == 0
                              or step + 1 == num_batches):
                progress.display(step)

    return top1.avg, topk.avg
def OneEpoch(epoch, train_loader, OPTIMIZER, DISP_FREQ, NUM_EPOCH_WARM_UP, NUM_BATCH_WARM_UP):
    """Train for one epoch, then evaluate and save checkpoints.

    Besides its arguments the function reads module-level names, among them
    ``BACKBONE``, ``HEAD``, ``LOSS``, ``DEVICE``, ``LR``, ``NUM_EPOCH``,
    ``writer``, ``MULTI_GPU``, ``EMBEDDING_SIZE``, ``BATCH_SIZE``,
    ``MODEL_ROOT``, ``BACKBONE_NAME``, ``HEAD_NAME`` and the ``lfw`` /
    ``vgg2_fp`` evaluation data.

    Args:
        epoch: zero-based epoch index (reported as ``epoch + 1``).
        train_loader: iterable of ``(inputs, labels)`` batches.
        OPTIMIZER: optimizer stepped once per batch.
        DISP_FREQ: running statistics are printed every ``DISP_FREQ`` batches.
        NUM_EPOCH_WARM_UP: number of leading epochs in which warm-up applies.
        NUM_BATCH_WARM_UP: number of leading batches of a warm-up epoch for
            which ``warm_up_lr`` is called.
    """
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    batch = 0  # batches processed so far in this epoch
    start = time.time()
    for inputs, labels in train_loader:
        # adjust LR for each training batch during warm up
        if (epoch + 1 <= NUM_EPOCH_WARM_UP) and (batch + 1 <= NUM_BATCH_WARM_UP):
            warm_up_lr(batch + 1, NUM_BATCH_WARM_UP, LR, OPTIMIZER)

        # compute output
        inputs = inputs.to(DEVICE, non_blocking=True)
        labels = labels.to(DEVICE, non_blocking=True).long()
        features = BACKBONE(inputs)
        outputs = HEAD(features, labels)
        loss = LOSS(outputs, labels)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs.data, labels, topk=(1, 5))
        losses.update(loss.data.item(), inputs.size(0))
        top1.update(prec1.data.item(), inputs.size(0))
        top5.update(prec5.data.item(), inputs.size(0))

        # compute gradient and do SGD step
        OPTIMIZER.zero_grad()
        loss.backward()
        OPTIMIZER.step()

        # display training loss & acc every DISP_FREQ batches
        if ((batch + 1) % DISP_FREQ == 0) and batch != 0:
            print("=" * 60)
            # ``batch`` restarts at 0 on every call, so the matching total is
            # the number of batches in one epoch, not in the whole run.
            print('Epoch {}/{} Batch {}/{}\t'
                  'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch + 1, NUM_EPOCH, batch + 1, len(train_loader),
                      loss=losses, top1=top1, top5=top5))
            # The timing window is DISP_FREQ batches, so report that number
            # instead of a fixed "100".
            print("Running speed in the last {} batches: {:.3f} iter/s.".format(
                DISP_FREQ, DISP_FREQ / (time.time() - start)))
            start = time.time()
            print("=" * 60)
        batch += 1

    epoch_loss = losses.avg
    epoch_acc = top1.avg
    writer.add_scalar("Training_Loss", epoch_loss, epoch + 1)
    writer.add_scalar("Training_Accuracy", epoch_acc, epoch + 1)
    print("=" * 60)
    print('Epoch: {}/{}\t'
          'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
          'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
          'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
              epoch + 1, NUM_EPOCH, loss=losses, top1=top1, top5=top5))
    print("=" * 60)

    # perform validation & save checkpoints per epoch
    # validation statistics per epoch (buffer for visualization)
    print("=" * 60)
    print("Perform Evaluation on LFW, CFP_FF, CFP_FP, AgeDB, CALFW, CPLFW and VGG2_FP, and Save Checkpoints...")
    # NOTE: only LFW and VGG2_FP are evaluated here.  The CFP_FF, CFP_FP,
    # AgeDB, CALFW and CPLFW evaluations were commented out in the original;
    # to re-enable one, mirror the two calls below with the matching
    # ``<name>`` / ``<name>_issame`` globals.
    accuracy_lfw, best_threshold_lfw, roc_curve_lfw = perform_val(
        MULTI_GPU, DEVICE, EMBEDDING_SIZE, BATCH_SIZE, BACKBONE, lfw, lfw_issame)
    buffer_val(writer, "LFW", accuracy_lfw, best_threshold_lfw, roc_curve_lfw, epoch + 1)
    accuracy_vgg2_fp, best_threshold_vgg2_fp, roc_curve_vgg2_fp = perform_val(
        MULTI_GPU, DEVICE, EMBEDDING_SIZE, BATCH_SIZE, BACKBONE, vgg2_fp, vgg2_fp_issame)
    buffer_val(writer, "VGGFace2_FP", accuracy_vgg2_fp, best_threshold_vgg2_fp, roc_curve_vgg2_fp, epoch + 1)
    print("=" * 60)

    # save checkpoints per epoch; with MULTI_GPU the backbone weights are
    # taken from ``BACKBONE.module``
    backbone_state = BACKBONE.module.state_dict() if MULTI_GPU else BACKBONE.state_dict()
    torch.save(backbone_state, os.path.join(MODEL_ROOT, "Backbone_{}_Epoch_{}_Batch_{}_Time_{}_checkpoint.pth".format(BACKBONE_NAME, epoch + 1, batch, get_time())))
    torch.save(HEAD.state_dict(), os.path.join(MODEL_ROOT, "Head_{}_Epoch_{}_Batch_{}_Time_{}_checkpoint.pth".format(HEAD_NAME, epoch + 1, batch, get_time())))
                model[index].nfe = 0
            nfe_backward = nfe_backward / len(odelayer_indexes)
            logger.info(f'nfe_backward is: {nfe_backward}')

        batch_time_meter.update(time.time() - end)
        if is_odenet:
            f_nfe_meter.update(nfe_forward)
            b_nfe_meter.update(nfe_backward)
        end = time.time()

        if iterations % batches_per_epoch == 0:
            train_loss /= batches_per_epoch
            epoch += 1

            with torch.no_grad():
                val_acc = accuracy(model, test_loader, args)
                logger.info(
                    "Epoch {:04d} | Time {:.3f} ({:.3f}) | NFE-F {:.1f} | NFE-B {:.1f} | "
                    "Test Acc {:.4f} | Training Loss {:.4f}".format(
                        iterations // batches_per_epoch, batch_time_meter.val,
                        batch_time_meter.avg, f_nfe_meter.avg, b_nfe_meter.avg,
                        val_acc, train_loss))

            writer.writerow(
                [f'{epoch}', f'{iterations}', f'{train_loss}', f'{val_acc}'])

            csv_file.flush()

            train_loss = 0.

            # Save state to file
Exemple #19
0
                    0]:  # adjust LR for each training stage after warm up, you can also choose to adjust LR manually (with slight modification) once plaueau observed
                schedule_lr(OPTIMIZER)
            if batch == STAGES[1]:
                schedule_lr(OPTIMIZER)
            if batch == STAGES[2]:
                schedule_lr(OPTIMIZER)

            # compute output
            inputs = inputs.to(DEVICE)
            labels = labels.to(DEVICE).long()
            features = BACKBONE(inputs)
            outputs = HEAD(features, labels)
            loss = LOSS(outputs, labels)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, labels, topk=(1, 5))
            losses.update(loss.data.item(), inputs.size(0))
            top1.update(prec1.data.item(), inputs.size(0))
            top5.update(prec5.data.item(), inputs.size(0))

            # compute gradient and do SGD step
            OPTIMIZER.zero_grad()
            loss.backward()
            OPTIMIZER.step()

            # dispaly training loss & acc every DISP_FREQ
            if batch % 2000 == 0 and batch != 0:
                print("=" * 60)
                print('Epoch {}/{} Batch {}/{}\t'
                      'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
Exemple #20
0
def train(train_loader, backbone, head, criterion, optimizer, epoch, cfg,
          writer):
    """Train ``backbone`` and ``head`` for one epoch and log the results.

    Args:
        train_loader: loader yielding ``(inputs, labels)`` batches.
        backbone: feature extractor returning ``(features, conv_features)``.
        head: classification head returning ``(outputs, original_logits)``.
        criterion: loss function applied to ``(outputs, labels)``.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index (reported as ``epoch + 1``).
        cfg: mapping providing ``'GPU'``, ``'NUM_EPOCH'`` and ``'RANK'``.
        writer: scalar logger with ``add_scalar``; used when
            ``cfg['RANK'] == 0``.
    """
    DISP_FREQ = 100  # 100 batch
    backbone.train()  # set to training mode
    head.train()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    # Pass the loader itself (not ``iter(loader)``) so tqdm can read its
    # length and show a total.
    for batch, (inputs, labels) in enumerate(tqdm(train_loader)):
        # compute output
        start_time = time.time()
        inputs = inputs.cuda(cfg['GPU'], non_blocking=True)
        labels = labels.cuda(cfg['GPU'], non_blocking=True)
        features, conv_features = backbone(inputs)

        outputs, original_logits = head(features, labels)
        loss = criterion(outputs, labels)
        end_time = time.time()
        duration = end_time - start_time
        if ((batch + 1) % DISP_FREQ == 0) and batch != 0:
            print("batch inference time", duration)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure accuracy (on the head's second output) and record loss
        prec1, prec5 = accuracy(original_logits.data, labels, topk=(1, 5))
        losses.update(loss.data.item(), inputs.size(0))
        top1.update(prec1.data.item(), inputs.size(0))
        top5.update(prec5.data.item(), inputs.size(0))
        # display training loss & acc on the first batch and every DISP_FREQ
        if ((batch + 1) % DISP_FREQ == 0) or batch == 0:
            print("=" * 60)
            print('Epoch {}/{} Batch {}/{}\t'
                  'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch + 1,
                      cfg['NUM_EPOCH'],
                      batch + 1,
                      len(train_loader),
                      loss=losses,
                      top1=top1,
                      top5=top5))
            print("=" * 60)
        sys.stdout.flush()
    epoch_loss = losses.avg
    epoch_acc = top1.avg
    print("=" * 60)
    print('Epoch: {}/{}\t'
          'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
          'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
          'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
              epoch + 1, cfg['NUM_EPOCH'], loss=losses, top1=top1, top5=top5))
    sys.stdout.flush()
    print("=" * 60)
    if cfg['RANK'] == 0:
        writer.add_scalar("Training_Loss", epoch_loss, epoch + 1)
        writer.add_scalar("Training_Accuracy", epoch_acc, epoch + 1)
        writer.add_scalar("Top1", top1.avg, epoch + 1)
        writer.add_scalar("Top5", top5.avg, epoch + 1)
Exemple #21
0
def _features_to_dataset(features):
    """Pack converted example features into a ``TensorDataset``.

    Reads the ``input_ids``, ``input_mask``, ``segment_ids`` and ``label_id``
    attributes of every feature and stores each column as a ``torch.long``
    tensor, in that order.
    """
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features],
                                 dtype=torch.long)
    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_label_ids)


def _flatten_predictions(predictions, num_labels):
    """Return the model outputs with every chunk viewed as ``(-1, num_labels)``.

    NOTE(review): presumably ``predictions`` is the per-device list produced
    by ``DataParallelModel`` -- confirm against the model wrapper in use.
    """
    return [chunk.view(-1, num_labels) for chunk in predictions]


def _run_evaluation(model, eval_dataloader, criterion, device, num_labels,
                    n_gpu):
    """Evaluate ``model`` over ``eval_dataloader``.

    Puts the model in eval mode (the caller is responsible for switching back
    to train mode) and returns ``(eval_loss, eval_accuracy)``: the loss
    averaged over batches, and the sum of the per-batch ``accuracy`` values
    divided by the number of evaluated examples.
    """
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for input_ids, input_mask, segment_ids, label_ids in tqdm(
            eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            eval_preds = model(input_ids, segment_ids, input_mask, label_ids)

        # Compute the loss. It is deliberately NOT divided by
        # gradient_accumulation_steps: that scaling only exists to average
        # accumulated training gradients and would under-report the eval loss.
        eval_preds = _flatten_predictions(eval_preds, num_labels)
        batch_loss = criterion(eval_preds, label_ids.view(-1))
        if n_gpu > 1:
            batch_loss = batch_loss.mean()  # mean() to average on multi-gpu.

        eval_preds = torch.cat(eval_preds)  # shape: [batch_size, num_labels]
        logits = eval_preds.detach().cpu().numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids.to('cpu').numpy())

        eval_loss += batch_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    return eval_loss / nb_eval_steps, eval_accuracy / nb_eval_examples


def main():
    """Train and/or run inference with a BERT-based sequence classifier.

    Driven entirely by the namespace returned from ``run_args()``:

    1. Optionally waits for a ``ptvsd`` debugger to attach.
    2. Selects the device (initialising the NCCL process group when
       ``local_rank != -1``) and seeds the Python, NumPy and torch RNGs.
    3. Builds the processor, tokenizer, model (``BertCnn`` when
       ``upper_model == "CNN"``, ``BertForSequenceClassification`` otherwise)
       and optimizer.
    4. With ``do_train``: trains for ``num_train_epochs`` epochs, evaluates on
       the dev set every ``eval_freq`` optimizer steps, then writes the
       weights and config into ``output_dir``.
    5. With ``do_infer``: prints the argmax class ids of every inference
       batch.

    Raises:
        ValueError: if ``gradient_accumulation_steps < 1``, if neither
            ``do_train`` nor ``do_infer`` is set, or if training is requested
            and ``output_dir`` already exists and is not empty.
        ImportError: if apex is required (distributed or fp16) but missing.
    """
    args = run_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # From here on train_batch_size is the per-forward-pass (micro) batch size.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_infer:
        raise ValueError(
            "At least one of `do_train` or `do_infer` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = ColaProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model: "CNN" selects the CNN head, every other value
    # (including "Linear") uses the plain classification head.
    if args.upper_model == "CNN":
        model = BertCnn.from_pretrained(args.bert_model,
                                        num_labels=num_labels,
                                        seq_len=args.max_seq_length)
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # Prepare optimizer: biases and LayerNorm parameters get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        # loss_scale == 0 selects dynamic loss scaling.
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        train_data = _features_to_dataset(train_features)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # The dev set never changes, so build its loader once instead of
        # re-tensorising every feature at each evaluation.
        eval_data = _features_to_dataset(eval_features)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=SequentialSampler(eval_data),
                                     batch_size=args.eval_batch_size)

        # Loop-invariant: one criterion serves every training and eval batch.
        loss_fct_parallel = DataParallelCriterion(CrossEntropyLoss())

        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                # Re-enter train mode every step: a preceding evaluation
                # leaves the model in eval mode.
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                predictions = model(input_ids, segment_ids, input_mask,
                                    label_ids)
                predictions = _flatten_predictions(predictions, num_labels)
                loss = loss_fct_parallel(predictions, label_ids.view(-1))
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    # Scale so the accumulated gradients form an average.
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps != 0:
                    # Still accumulating gradients: no optimizer step yet.
                    continue

                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps,
                        args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                # Evaluate only right after an optimizer step. Checking on
                # every micro-batch would re-run the whole evaluation several
                # times for the same global_step when gradients are
                # accumulated over more than one batch.
                if global_step % args.eval_freq == 0:
                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)
                    eval_loss, eval_accuracy = _run_evaluation(
                        model, eval_dataloader, loss_fct_parallel, device,
                        num_labels, n_gpu)
                    result = {
                        'eval_loss': eval_loss,
                        'eval_accuracy': eval_accuracy,
                        'global_step': global_step,
                        # Running mean of the training loss in this epoch.
                        'loss': tr_loss / nb_tr_steps
                    }
                    logger.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

    if args.do_infer:
        infer_examples = processor.get_infer_examples(args.data_dir)
        infer_features = convert_examples_to_features(infer_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running Inference *****")
        logger.info("  Num examples = %d", len(infer_examples))
        logger.info("  Batch size = %d", args.infer_batch_size)
        infer_data = _features_to_dataset(infer_features)
        # Run prediction for full data
        infer_dataloader = DataLoader(infer_data,
                                      sampler=SequentialSampler(infer_data),
                                      batch_size=args.infer_batch_size)

        model.eval()

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                infer_dataloader, desc="Inference"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                infer_preds = model(input_ids, segment_ids, input_mask,
                                    label_ids)

            infer_preds = torch.cat(
                _flatten_predictions(infer_preds,
                                     num_labels))  # shape: [batch_size, num_labels]
            logits = infer_preds.detach().cpu().numpy()
            # Predicted class id per example in this batch.
            outputs = np.argmax(logits, axis=1)
            print(outputs)
        logger.info("***** Infer finished *****")
                0]:  # adjust LR for each training stage after warm up, you can also choose to adjust LR manually (with slight modification) once plaueau observed
                schedule_lr(OPTIMIZER)
            if batch == STAGES[1]:
                schedule_lr(OPTIMIZER)
            if batch == STAGES[2]:
                schedule_lr(OPTIMIZER)

            # compute output
            inputs = inputs.to(DEVICE)
            labels = labels.to(DEVICE).long()
            features = BACKBONE(inputs)
            outputs = HEAD(features, labels)
            loss = LOSS(outputs, labels)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, labels, topk=(1, 5))
            losses.update(loss.data.item(), inputs.size(0))
            top1.update(prec1.data.item(), inputs.size(0))
            top5.update(prec5.data.item(), inputs.size(0))

            # compute gradient and do SGD step
            OPTIMIZER.zero_grad()
            loss.backward()
            OPTIMIZER.step()

            # display training loss & acc every DISP_FREQ
            if batch % 2000 == 0 and batch != 0:
                print("=" * 60)
                print('Epoch {}/{} Batch {}/{}\t'
                      'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
def test(features, adj, labels, index=None):
    """Score the module-level ``model`` on a subset of rows.

    Args:
        features: first argument forwarded to ``model``; its ``shape[0]``
            defines the default row selection.
        adj: second argument forwarded to ``model``.
        labels: targets, indexed with ``index``.
        index: rows of the model output and of ``labels`` to score.
            Defaults to every row of ``features``.

    Returns:
        A ``(loss, acc)`` pair: ``F.nll_loss`` and ``utils.accuracy`` computed
        on the selected rows.
    """
    if index is None:
        # Resolve the default at call time from the argument actually passed.
        # A default written in the signature is evaluated once, at definition
        # time, against whatever global ``features`` exists then.
        index = range(features.shape[0])
    model.eval()
    output = model(features, adj)
    loss = F.nll_loss(output[index], labels[index])
    acc = utils.accuracy(output[index], labels[index])
    return loss, acc