Example #1
def predicate(data_loader, feature_extractor, output_path=None):
    batch_time = AverageMeter("Time", ":6.3f", write_val=False)
    model = feature_extractor.model
    outputs_dict = dict()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        toc = time.time()
        for batch_ind, (input, _) in enumerate(data_loader):
            input = input.cuda(non_blocking=True)

            # forward to get intermediate outputs
            _ = model(input)

            # synchronize so that everything is calculated
            torch.cuda.synchronize()

            for target_layer, target_output in feature_extractor.target_outputs.items():
                # move to CPU before converting to numpy (the hooks capture CUDA tensors)
                features = target_output.data.cpu().numpy()
                if target_layer in outputs_dict:
                    outputs_dict[target_layer].append(features)
                else:
                    outputs_dict[target_layer] = [features]

            # measure elapsed time
            batch_time.update(time.time() - toc)
            toc = time.time()

            if batch_ind % 10 == 0:
                print('Predicate: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'.
                      format(batch_ind,
                             len(data_loader),
                             batch_time=batch_time))

    if output_path is not None:
        for key, val in outputs_dict.items():
            np.save(output_path + '_' + key, np.concatenate(val, 0))
            print(key, 'saved')
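
Note: feature_extractor above is external to this snippet. A minimal sketch of such a class, where forward hooks record intermediate outputs keyed by layer name (the class and attribute names below mirror the usage above but are assumptions, not the original implementation):

import torch

class FeatureExtractor:
    def __init__(self, model, target_layers):
        self.model = model
        self.target_outputs = {}
        for name, module in model.named_modules():
            if name in target_layers:
                module.register_forward_hook(self._make_hook(name))

    def _make_hook(self, name):
        # record this layer's output on every forward pass
        def hook(module, inputs, output):
            self.target_outputs[name] = output.detach()
        return hook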
Example #2
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch,
          args, writer):
    print(
        " ->->->->->->->->->-> One epoch with Natural training <-<-<-<-<-<-<-<-<-<-"
    )

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch),
    )

    model.train()
    end = time.time()

    dataloader = train_loader if sm_loader is None else zip(
        train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        if sm_loader:
            images, target = (
                torch.cat([d[0] for d in data], 0).to(device),
                torch.cat([d[1] for d in data], 0).to(device),
            )
        else:
            images, target = data[0].to(device), data[1].to(device)

        # basic properties of training
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print("Pixel range for training images : [{}, {}]".format(
                torch.min(images).data.cpu().numpy(),
                torch.max(images).data.cpu().numpy(),
            ))

        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(writer, "train",
                                          epoch * len(train_loader) + i)

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0:len(images) // 4]),
            )
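
accuracy() is shared by these snippets but never shown. The standard top-k implementation from the PyTorch ImageNet example, which matches how acc1[0] / acc1.item() are indexed above, looks like this:

import torch

def accuracy(output, target, topk=(1,)):
    """Compute precision@k for each k; returns a list of 1-element tensors."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res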
Example #3
def main_worker(args):
    train, validate, modifier = get_trainer(args)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)
    model = set_gpu(args, model)
    wandb.watch(model)

    if args.pretrained:
        pretrained(args, model)

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        best_acc1 = resume(args, model, optimizer)

    # Evaluate-only mode: run validation once and exit
    if args.evaluate:
        acc1, acc5 = validate(data.val_loader,
                              model,
                              criterion,
                              args,
                              writer=None,
                              epoch=args.start_epoch)

        return

    # Set up directories
    run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)
    args.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(1,
                                     [epoch_time, validation_time, train_time],
                                     prefix="Overall Timing")

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    save_checkpoint(
        {
            "epoch": 0,
            "arch": args.arch,
            "state_dict": model.state_dict(),
            "best_acc1": best_acc1,
            "best_acc5": best_acc5,
            "best_train_acc1": best_train_acc1,
            "best_train_acc5": best_train_acc5,
            "optimizer": optimizer.state_dict(),
            "curr_acc1": acc1 if acc1 else "Not evaluated",
        },
        False,
        filename=ckpt_base_dir / f"initial.state",
        save=False,
    )

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        lr_policy(epoch, iteration=None)
        modifier(args, epoch, model)

        cur_lr = get_lr(optimizer)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(data.train_loader,
                                       model,
                                       criterion,
                                       optimizer,
                                       epoch,
                                       args,
                                       writer=writer)
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()
        acc1, acc5 = validate(data.val_loader, model, criterion, args, writer,
                              epoch)
        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = ((epoch % args.save_every) == 0) and args.save_every > 0
        if is_best or save or epoch == args.epochs - 1:
            if is_best:
                print(
                    f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}"
                )

            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "best_acc5": best_acc5,
                    "best_train_acc1": best_train_acc1,
                    "best_train_acc5": best_train_acc5,
                    "optimizer": optimizer.state_dict(),
                    "curr_acc1": acc1,
                    "curr_acc5": acc5,
                },
                is_best,
                filename=ckpt_base_dir / f"epoch_{epoch}.state",
                save=save,
            )
            wandb.log({
                "curr_acc1": acc1,
                "curr_acc5": acc5,
            })

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer,
                                              prefix="diagnostics",
                                              global_step=epoch)

        if args.conv_type == "SampleSubnetConv":
            count = 0
            sum_pr = 0.0
            for n, m in model.named_modules():
                if isinstance(m, SampleSubnetConv):
                    # avg pr across 10 samples
                    pr = 0.0
                    for _ in range(10):
                        pr += ((torch.rand_like(m.clamped_scores) >=
                                m.clamped_scores).float().mean().item())
                    pr /= 10.0
                    writer.add_scalar("pr/{}".format(n), pr, epoch)
                    sum_pr += pr
                    count += 1

            args.prune_rate = sum_pr / count
            writer.add_scalar("pr/average", args.prune_rate, epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

    write_result_to_csv(
        best_acc1=best_acc1,
        best_acc5=best_acc5,
        best_train_acc1=best_train_acc1,
        best_train_acc5=best_train_acc5,
        prune_rate=args.prune_rate,
        curr_acc1=acc1,
        curr_acc5=acc5,
        base_config=args.config,
        name=args.name,
    )
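
AverageMeter and ProgressMeter are used throughout these examples but never defined. Minimal sketches in the spirit of the PyTorch ImageNet example follow; the write_val/write_avg flags and the write_to_tensorboard method seen above are project-specific extensions omitted here:

class AverageMeter:
    """Tracks the current value, running sum, and average of a metric."""

    def __init__(self, name, fmt=":f"):
        self.name, self.fmt = name, fmt
        self.val = self.sum = self.count = self.avg = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
        return fmtstr.format(**self.__dict__)


class ProgressMeter:
    """Prints a prefix, the batch index, and every registered meter."""

    def __init__(self, num_batches, meters, prefix=""):
        self.num_batches, self.meters, self.prefix = num_batches, meters, prefix

    def display(self, batch):
        entries = ["{}[{}/{}]".format(self.prefix, batch, self.num_batches)]
        entries += [str(meter) for meter in self.meters]
        print("\t".join(entries))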
Example #4
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch,
          args, writer):
    print(
        " ->->->->->->->->->-> One epoch with Adversarial training (TRADES) <-<-<-<-<-<-<-<-<-<-"
    )

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch),
    )

    model.train()
    end = time.time()

    dataloader = train_loader if sm_loader is None else zip(
        train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        if sm_loader:
            images, target = (
                torch.cat([d[0] for d in data], 0).to(device),
                torch.cat([d[1] for d in data], 0).to(device),
            )
        else:
            images, target = data[0].to(device), data[1].to(device)

        # basic properties of training data
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(
                f"Training images range: {[torch.min(images), torch.max(images)]}"
            )

        output = model(images)

        # calculate robust loss
        loss = pgd_loss(
            model=model,
            x_natural=images,
            y=target,
            device=device,
            optimizer=optimizer,
            step_size=args.step_size,
            epsilon=args.epsilon,
            perturb_steps=args.num_steps,
            beta=args.beta,
            clip_min=args.clip_min,
            clip_max=args.clip_max,
            distance=args.distance,
        )

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(writer, "train",
                                          epoch * len(train_loader) + i)

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0:len(images) // 4]),
            )
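
pgd_loss is external to this snippet. TRADES combines a cross-entropy term on natural inputs with a KL term on adversarial ones; a minimal l_inf sketch of that idea follows (it ignores the clip_min/clip_max/distance knobs of the real implementation, and the random-start constant is an assumption):

import torch
import torch.nn.functional as F

def trades_loss_sketch(model, x_natural, y, optimizer, step_size, epsilon,
                       perturb_steps, beta):
    model.eval()
    # random start near the natural example
    x_adv = x_natural.detach() + 0.001 * torch.randn_like(x_natural)
    for _ in range(perturb_steps):
        x_adv.requires_grad_()
        with torch.enable_grad():
            kl = F.kl_div(F.log_softmax(model(x_adv), dim=1),
                          F.softmax(model(x_natural), dim=1),
                          reduction="batchmean")
        grad = torch.autograd.grad(kl, [x_adv])[0]
        x_adv = x_adv.detach() + step_size * grad.sign()
        x_adv = torch.min(torch.max(x_adv, x_natural - epsilon),
                          x_natural + epsilon)
        x_adv = torch.clamp(x_adv, 0.0, 1.0)
    model.train()
    optimizer.zero_grad()
    logits = model(x_natural)
    loss_natural = F.cross_entropy(logits, y)
    loss_robust = F.kl_div(F.log_softmax(model(x_adv), dim=1),
                           F.softmax(logits, dim=1),
                           reduction="batchmean")
    return loss_natural + beta * loss_robust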
Example #5
def validate(val_loader, model, criterion, args, writer, epoch):
    batch_time = AverageMeter("Time", ":6.3f", write_val=False)
    losses = AverageMeter("Loss", ":.3f", write_val=False)
    top1 = AverageMeter("Acc@1", ":6.2f", write_val=False)
    top5 = AverageMeter("Acc@5", ":6.2f", write_val=False)
    progress = ProgressMeter(len(val_loader), [batch_time, losses, top1, top5],
                             prefix="Test: ")

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in tqdm.tqdm(enumerate(val_loader),
                                             ascii=True,
                                             total=len(val_loader)):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)

            target = target.cuda(args.gpu, non_blocking=True).long()

            # compute output
            output = model(images)

            loss = criterion(output, target.view(-1))

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1.item(), images.size(0))
            top5.update(acc5.item(), images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        progress.display(len(val_loader))

        if writer is not None:
            progress.write_to_tensorboard(writer,
                                          prefix="test",
                                          global_step=epoch)

    return top1.avg, top5.avg
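
The criterion passed into validate is built in Example #3 as either nn.CrossEntropyLoss or a LabelSmoothing module. The latter is not shown in these snippets; a common formulation (a sketch, assuming uniform smoothing over all classes) is:

import torch
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothing(nn.Module):
    """Cross-entropy with uniformly smoothed targets (sketch)."""

    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, logits, target):
        log_probs = F.log_softmax(logits, dim=-1)
        nll = -log_probs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
        smooth = -log_probs.mean(dim=-1)
        return ((1.0 - self.smoothing) * nll + self.smoothing * smooth).mean()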
Example #6
def train(args):
    writer = LogWriter(logdir=args.logdir)

    rank = int(os.getenv("PADDLE_TRAINER_ID", 0))
    world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1))

    gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
    place = paddle.CUDAPlace(gpu_id)

    if world_size > 1:
        import paddle.distributed.fleet as fleet
        from .utils.data_parallel import sync_gradients, sync_params

        strategy = fleet.DistributedStrategy()
        strategy.without_graph_optimization = True
        fleet.init(is_collective=True, strategy=strategy)

    if args.use_synthetic_dataset:
        trainset = SyntheticDataset(args.num_classes, fp16=args.fp16)
    else:
        trainset = CommonDataset(root_dir=args.data_dir,
                                 label_file=args.label_file,
                                 fp16=args.fp16,
                                 is_bin=args.is_bin)

    num_image = len(trainset)
    total_batch_size = args.batch_size * world_size
    steps_per_epoch = num_image // total_batch_size
    if args.train_unit == 'epoch':
        warmup_steps = steps_per_epoch * args.warmup_num
        total_steps = steps_per_epoch * args.train_num
        decay_steps = [x * steps_per_epoch for x in args.decay_boundaries]
        total_epoch = args.train_num
    else:
        warmup_steps = args.warmup_num
        total_steps = args.train_num
        decay_steps = list(args.decay_boundaries)
        total_epoch = (total_steps + steps_per_epoch - 1) // steps_per_epoch

    if rank == 0:
        logging.info('world_size: {}'.format(world_size))
        logging.info('total_batch_size: {}'.format(total_batch_size))
        logging.info('warmup_steps: {}'.format(warmup_steps))
        logging.info('steps_per_epoch: {}'.format(steps_per_epoch))
        logging.info('total_steps: {}'.format(total_steps))
        logging.info('total_epoch: {}'.format(total_epoch))
        logging.info('decay_steps: {}'.format(decay_steps))

    base_lr = total_batch_size * args.lr / 512
    lr_scheduler = paddle.optimizer.lr.PiecewiseDecay(
        boundaries=decay_steps,
        values=[
            base_lr * (args.lr_decay**i) for i in range(len(decay_steps) + 1)
        ])
    if warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler, warmup_steps, 0, base_lr)

    if args.fp16:
        paddle.set_default_dtype("float16")

    margin_loss_params = getattr(losses, args.loss)()
    backbone = getattr(backbones, args.backbone)(
        num_features=args.embedding_size, dropout=args.dropout)
    classifier = getattr(classifiers, args.classifier)(
        rank=rank,
        world_size=world_size,
        num_classes=args.num_classes,
        margin1=margin_loss_params.margin1,
        margin2=margin_loss_params.margin2,
        margin3=margin_loss_params.margin3,
        scale=margin_loss_params.scale,
        sample_ratio=args.sample_ratio,
        embedding_size=args.embedding_size,
        fp16=args.fp16)

    backbone.train()
    classifier.train()

    optimizer = paddle.optimizer.Momentum(
        parameters=[
            {'params': backbone.parameters()},
            {'params': classifier.parameters()},
        ],
        learning_rate=lr_scheduler,
        momentum=args.momentum,
        weight_decay=args.weight_decay)

    if args.fp16:
        optimizer._dtype = 'float32'

    if world_size > 1:
        # sync backbone params for data parallel
        sync_params(backbone.parameters())

    if args.do_validation_while_train:
        callback_verification = CallBackVerification(
            args.validation_interval_step,
            rank,
            args.batch_size,
            args.val_targets,
            args.data_dir,
            fp16=args.fp16,
        )

    callback_logging = CallBackLogging(args.log_interval_step, rank,
                                       world_size, total_steps,
                                       args.batch_size, writer)

    checkpoint = Checkpoint(
        rank=rank,
        world_size=world_size,
        embedding_size=args.embedding_size,
        num_classes=args.num_classes,
        model_save_dir=os.path.join(args.output, args.backbone),
        checkpoint_dir=args.checkpoint_dir,
        max_num_last_checkpoint=args.max_num_last_checkpoint)

    start_epoch = 0
    global_step = 0
    loss_avg = AverageMeter()
    if args.resume:
        extra_info = checkpoint.load(backbone,
                                     classifier,
                                     optimizer,
                                     for_train=True)
        start_epoch = extra_info['epoch'] + 1
        lr_state = extra_info['lr_state']
        # For PiecewiseDecay, last_epoch actually stores the last *step*,
        # since the lr_scheduler is stepped once per iteration.
        global_step = lr_state['last_epoch']
        lr_scheduler.set_state_dict(lr_state)

    train_loader = paddle.io.DataLoader(
        trainset,
        places=place,
        num_workers=args.num_workers,
        batch_sampler=paddle.io.DistributedBatchSampler(
            dataset=trainset,
            batch_size=args.batch_size,
            shuffle=True,
            drop_last=True))

    scaler = LSCGradScaler(
        enable=args.fp16,
        init_loss_scaling=args.init_loss_scaling,
        incr_ratio=args.incr_ratio,
        decr_ratio=args.decr_ratio,
        incr_every_n_steps=args.incr_every_n_steps,
        decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
        use_dynamic_loss_scaling=args.use_dynamic_loss_scaling)

    for epoch in range(start_epoch, total_epoch):
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for step, (img, label) in enumerate(train_loader):
            train_reader_cost += time.time() - reader_start
            global_step += 1
            train_start = time.time()
            with paddle.amp.auto_cast(enable=args.fp16):
                features = backbone(img)
                loss_v = classifier(features, label)

            scaler.scale(loss_v).backward()
            if world_size > 1:
                # data parallel sync backbone gradients
                sync_gradients(backbone.parameters())

            scaler.step(optimizer)
            classifier.step(optimizer)
            optimizer.clear_grad()
            classifier.clear_grad()

            train_run_cost += time.time() - train_start
            total_samples += len(img)

            lr_value = optimizer.get_lr()
            loss_avg.update(loss_v.item(), 1)
            callback_logging(
                global_step,
                loss_avg,
                epoch,
                lr_value,
                avg_reader_cost=train_reader_cost / args.log_interval_step,
                avg_batch_cost=(train_reader_cost + train_run_cost) /
                args.log_interval_step,
                avg_samples=total_samples / args.log_interval_step,
                ips=total_samples / (train_reader_cost + train_run_cost))

            if args.do_validation_while_train:
                callback_verification(global_step, backbone)
            lr_scheduler.step()

            if global_step >= total_steps:
                break
            sys.stdout.flush()
            if rank == 0 and global_step > 0 and global_step % args.log_interval_step == 0:
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            reader_start = time.time()
        checkpoint.save(backbone,
                        classifier,
                        optimizer,
                        epoch=epoch,
                        for_train=True)
    writer.close()
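
For reference, the warmup-plus-piecewise schedule built above can be inspected in isolation with the same Paddle APIs (the boundary and value numbers below are illustrative assumptions):

import paddle

base_lr = 0.1
scheduler = paddle.optimizer.lr.PiecewiseDecay(
    boundaries=[10, 20], values=[base_lr, base_lr * 0.1, base_lr * 0.01])
scheduler = paddle.optimizer.lr.LinearWarmup(
    scheduler, warmup_steps=5, start_lr=0.0, end_lr=base_lr)

for step in range(25):
    print(step, scheduler.get_lr())  # ramps 0 -> 0.1 over 5 steps, then decays
    scheduler.step()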
Example #7
def train(train_loader, model, criterion, optimizer, epoch, args, writer):
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    batch_size = train_loader.batch_size
    num_batches = len(train_loader)
    end = time.time()
    for i, (images, target) in tqdm.tqdm(enumerate(train_loader),
                                         ascii=True,
                                         total=len(train_loader)):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)

        target = target.cuda(args.gpu, non_blocking=True)

        # Write scores and weights to tensorboard at the start of every other epoch.
        # The original condition i % (num_batches * batch_size) == 0 only ever
        # holds at i == 0, so it is written directly.
        if args.histograms:
            if i == 0 and epoch % 2 == 0:
                for param_name in model.state_dict():
                    writer.add_histogram(param_name,
                                         model.state_dict()[param_name], epoch)

        # compute output
        output = model(images)

        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1.item(), images.size(0))
        top5.update(acc5.item(), images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        if args.grad_clip:
            torch.nn.utils.clip_grad_value_(model.parameters(), 1)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)

            progress.write_to_tensorboard(writer,
                                          prefix="train",
                                          global_step=t)

        # Write score gradients to tensorboard at the end of every other epoch.
        # Iterate named_parameters so names stay aligned with gradients
        # (state_dict also contains buffers, which have no grad).
        if args.histograms:
            if i == 0 and epoch % 2 == 0:
                for param_name, param in model.named_parameters():
                    if param.grad is not None:
                        writer.add_histogram(param_name + '.grad',
                                             param.grad, epoch)
                    else:
                        writer.add_histogram(param_name + '.grad', 0, epoch)

    # Write final scores and weights to tensorboard
    if args.histograms:
        for param_name in model.state_dict():
            writer.add_histogram(param_name,
                                 model.state_dict()[param_name], epoch)

    return top1.avg, top5.avg
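
Example #7 guards gradient clipping behind args.grad_clip using elementwise clip_grad_value_; torch.nn.utils also offers norm-based clipping. The difference is easy to see on a toy parameter:

import torch

p = torch.nn.Parameter(torch.randn(3))
p.grad = torch.randn(3) * 10

torch.nn.utils.clip_grad_value_([p], 1)  # clamps each grad element to [-1, 1]
torch.nn.utils.clip_grad_norm_([p], 1)   # rescales so the total grad norm is <= 1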
Example #8
def train(train_loader, model, criterion, optimizer, epoch, cfg, writer):
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    progress = ProgressMeter(
        train_loader.num_batches,
        [batch_time, data_time, losses, top1, top5],
        cfg,
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    batch_size = train_loader.batch_size
    num_batches = train_loader.num_batches
    end = time.time()

    for i, data in enumerate(train_loader):
        images, target = data[0].cuda(), data[1].long().squeeze().cuda()
        # measure data loading time
        data_time.update(time.time() - end)

        if cfg.cs_kd:

            batch_size = images.size(0)
            loss_batch_size = batch_size // 2
            targets_ = target[:batch_size // 2]
            outputs = model(images[:batch_size // 2])
            loss = torch.mean(criterion(outputs, targets_))

            with torch.no_grad():
                outputs_cls = model(images[batch_size // 2:])
            # outputs already holds only the first batch_size // 2 examples
            cls_loss = kdloss(outputs, outputs_cls.detach())
            lamda = 3
            loss += lamda * cls_loss
            acc1, acc5 = accuracy(outputs, targets_, topk=(1, 5))
        else:
            batch_size = images.size(0)
            loss_batch_size = batch_size
            #compute output
            output = model(images)
            loss = criterion(output, target)
            acc1, acc5 = accuracy(output, target, topk=(1, 5))

        # measure accuracy and record loss
        losses.update(loss.item(), loss_batch_size)
        top1.update(acc1.item(), loss_batch_size)
        top5.update(acc5.item(), loss_batch_size)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % cfg.print_freq == 0 or i == num_batches - 1:
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)
            progress.write_to_tensorboard(writer,
                                          prefix="train",
                                          global_step=t)

    return top1.avg, top5.avg
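
kdloss in the cs_kd branch above is not defined in this snippet. For class-wise self-knowledge distillation it is commonly a temperature-scaled KL divergence; a minimal sketch (the helper and its temperature default are assumptions):

import torch.nn.functional as F

def kdloss(logits, target_logits, T=4.0):
    """KL divergence between temperature-softened predictions (hypothetical helper)."""
    log_p = F.log_softmax(logits / T, dim=1)
    q = F.softmax(target_logits / T, dim=1)
    return F.kl_div(log_p, q, reduction="batchmean") * (T * T)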
Example #9
def freeadv(model, device, val_loader, criterion, args, writer, epoch=0):

    assert (
        not args.normalize
    ), "Explicit normalization is done inside the loop; the dataset should have a [0, 1] dynamic range."

    # Mean/Std for normalization
    mean = torch.Tensor(np.array(args.mean)[:, np.newaxis, np.newaxis])
    mean = mean.expand(3, args.image_dim, args.image_dim).to(device)
    std = torch.Tensor(np.array(args.std)[:, np.newaxis, np.newaxis])
    std = std.expand(3, args.image_dim, args.image_dim).to(device)

    batch_time = AverageMeter("Time", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix="Test: ",
    )

    eps = args.epsilon
    K = args.num_steps
    step = args.step_size
    model.eval()
    end = time.time()
    print(" PGD eps: {}, num-steps: {}, step-size: {} ".format(eps, K, step))
    for i, (input, target) in enumerate(val_loader):

        input = input.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        orig_input = input.clone()
        randn = torch.FloatTensor(input.size()).uniform_(-eps, eps).to(device)
        input += randn
        input.clamp_(0, 1.0)
        for _ in range(K):
            invar = Variable(input, requires_grad=True)
            in1 = invar - mean
            in1.div_(std)
            output = model(in1)
            ascend_loss = criterion(output, target)
            ascend_grad = torch.autograd.grad(ascend_loss, invar)[0]
            pert = fgsm(ascend_grad, step)
            # apply the perturbation
            input += pert.data
            input = torch.max(orig_input - eps, input)
            input = torch.min(orig_input + eps, input)
            input.clamp_(0, 1.0)

        input.sub_(mean).div_(std)
        with torch.no_grad():
            # compute output
            output = model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1[0], input.size(0))
            top5.update(prec5[0], input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        if (i + 1) % args.print_freq == 0:
            progress.display(i)

        if writer:
            progress.write_to_tensorboard(writer, "test",
                                          epoch * len(val_loader) + i)

        # write a sample of test images to tensorboard (helpful for debugging)
        if i == 0 and writer:
            writer.add_image(
                "Adv-test-images",
                torchvision.utils.make_grid(input[0:len(input) // 4]),
            )

    progress.display(i)  # print final results

    return top1.avg, top5.avg
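
fgsm above is an external helper. The usual FGSM step is just the sign of the gradient scaled by the step size, which matches how the result is added to the input here; a minimal sketch of that assumption:

def fgsm(gradients, step_size):
    # one signed-gradient ascent step
    return step_size * gradients.sign()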
Example #10
def smooth(model, device, val_loader, criterion, args, writer, epoch=0):
    """
        Evaluating on unmodified validation set inputs.
    """
    batch_time = AverageMeter("Time", ":6.3f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    rad = AverageMeter("rad", ":6.2f")
    progress = ProgressMeter(len(val_loader), [batch_time, top1, top5, rad],
                             prefix="Smooth (eval): ")

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, data in enumerate(val_loader):
            images, target = data[0].to(device), data[1].to(device)

            # Default: evaluate on 10 random samples of additive Gaussian noise.
            output = []
            for _ in range(10):
                # add noise
                if args.dataset == "imagenet":
                    std = (torch.tensor([
                        0.229, 0.224, 0.225
                    ]).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)).to(device)
                    noise = (torch.randn_like(images) /
                             std).to(device) * args.noise_std
                else:
                    noise = torch.randn_like(images).to(
                        device) * args.noise_std

                output.append(F.softmax(model(images + noise), -1))

            output = torch.sum(torch.stack(output), axis=0)

            p_max, _ = output.max(dim=-1)
            radii = (args.noise_std + 1e-16) * norm.ppf(
                p_max.data.cpu().numpy())

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))
            rad.update(np.mean(radii))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i + 1) % args.print_freq == 0:
                progress.display(i)

            if writer:
                progress.write_to_tensorboard(writer, "test",
                                              epoch * len(val_loader) + i)

            # write a sample of test images to tensorboard (helpful for debugging)
            if i == 0 and writer:
                writer.add_image(
                    "Adv-test-images",
                    torchvision.utils.make_grid(images[0:len(images) // 4]),
                )

        progress.display(i)  # print final results

    return top1.avg, rad.avg
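
The radius computed above follows the randomized-smoothing certificate of Cohen et al. (2019): radius = sigma * Phi^{-1}(p), where p is the estimated top-class probability under Gaussian noise and Phi^{-1} is the standard normal quantile. In isolation (helper name is hypothetical):

from scipy.stats import norm

def certified_radius(p_top, sigma):
    # non-positive when the smoothed classifier is not confident (p_top <= 0.5)
    return sigma * norm.ppf(p_top)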
Example #11
def ibp(model, device, val_loader, criterion, args, writer, epoch=0):
    batch_time = AverageMeter("Time", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    ibp_losses = AverageMeter("IBP_Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    ibp_top1 = AverageMeter("IBP-Acc_1", ":6.2f")
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, ibp_losses, top1, top5, ibp_top1],
        prefix="Test: ",
    )

    # switch to evaluation mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, data in enumerate(val_loader):
            images, target = data[0].to(device), data[1].to(device)

            # clean images

            output = model(images)
            loss = criterion(output, target)

            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            rce, rerr = naive_interval_analyze(
                model,
                args.epsilon,
                images,
                target,
                use_cuda=torch.cuda.is_available(),
                parallel=False,
            )

            # record IBP loss and verified accuracy
            ibp_losses.update(rce.item(), images.size(0))
            ibp_top1.update((1 - rerr) * 100.0, images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i + 1) % args.print_freq == 0:
                progress.display(i)

            if writer:
                progress.write_to_tensorboard(writer, "test",
                                              epoch * len(val_loader) + i)

            # write a sample of test images to tensorboard (helpful for debugging)
            if i == 0 and writer:
                writer.add_image(
                    "Adv-test-images",
                    torchvision.utils.make_grid(images[0:len(images) // 4]),
                )
        progress.display(i)  # print final results

    return ibp_top1.avg, ibp_top1.avg
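
naive_interval_analyze is an external verifier. Interval bound propagation pushes an l_inf box through the network layer by layer; a minimal sketch for a single linear layer shows the core arithmetic (the helper name is hypothetical):

import torch

def linear_interval(W, b, lo, hi):
    """Propagate elementwise bounds [lo, hi] through x -> x @ W.T + b."""
    center, radius = (hi + lo) / 2, (hi - lo) / 2
    new_center = center @ W.t() + b
    new_radius = radius @ W.abs().t()
    return new_center - new_radius, new_center + new_radius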
Example #12
def train(
    model,
    device,
    train_loader,
    sm_loader,
    criterion,
    optimizer,
    epoch,
    args,
    writer=None,
):

    assert (
        not args.normalize
    ), "Explicit normalization is done inside the loop; the dataset should have a [0, 1] dynamic range."

    global_noise_data = torch.zeros(
        [args.batch_size, 3, args.image_dim, args.image_dim]).to(device)

    mean = torch.Tensor(np.array(args.mean)[:, np.newaxis, np.newaxis])
    mean = mean.expand(3, args.image_dim, args.image_dim).to(device)
    std = torch.Tensor(np.array(args.std)[:, np.newaxis, np.newaxis])
    std = std.expand(3, args.image_dim, args.image_dim).to(device)

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch),
    )

    # switch to train mode
    model.train()
    for i, (input, target) in enumerate(train_loader):
        end = time.time()
        input = input.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)
        data_time.update(time.time() - end)

        for _ in range(args.n_repeats):
            # Ascend on the global noise
            noise_batch = Variable(global_noise_data[0:input.size(0)],
                                   requires_grad=True).to(device)
            in1 = input + noise_batch
            in1.clamp_(0, 1.0)
            in1.sub_(mean).div_(std)
            output = model(in1)
            loss = criterion(output, target)

            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1[0], input.size(0))
            top5.update(prec5[0], input.size(0))

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()

            # Update the noise for the next iteration
            pert = fgsm(noise_batch.grad, args.epsilon)
            global_noise_data[0:input.size(0)] += pert.data
            global_noise_data.clamp_(-args.epsilon, args.epsilon)

            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)
                progress.write_to_tensorboard(writer, "train",
                                              epoch * len(train_loader) + i)

        if i == 0:
            print(
                in1.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(f"Training images range: {[torch.min(in1), torch.max(in1)]}")

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(input[0:len(input) // 4]),
            )
Example #13
def main(config, exp_dir, checkpoint=None):
    torch.manual_seed(config["random_seed"])
    np.random.seed(config["random_seed"])
    random.seed(config["random_seed"])

    logger = Logger(exp_dir)

    device = torch.device("cuda" if config["use_gpu"] else "cpu")

    train_loader, val_loader = get_data_loaders(config, device)

    model = get_model(config["model_name"], **config["model_args"]).to(device)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config["learning_rate"],
                                 weight_decay=config["weight_decay"])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min")

    if "load_encoder" in config:
        encoder_model, _ = load_checkpoint(config["load_encoder"], device,
                                           get_model)
        model.encoder = encoder_model.encoder

    if checkpoint:
        logger.log("Resume training..")
        metrics = load_metrics(exp_dir)
        best_val_loss = checkpoint["best_val_loss"]
        i_episode = checkpoint["epoch"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler.load_state_dict(checkpoint["scheduler"])
    else:
        i_episode = 0
        best_val_loss = float("inf")
        metrics = {
            "between_eval_time": AverageMeter(),
            "data_time": AverageMeter(),
            "batch_time": AverageMeter(),
            "train_losses": AverageMeter(),
            "train_accs": AverageMeter(),
            "val_time": AverageMeter(),
            "val_batch_time": AverageMeter(),
            "val_data_time": AverageMeter(),
            "val_losses": AverageMeter(),
            "val_accs": AverageMeter()
        }

    keep_training = True
    end = time.time()
    between_eval_end = time.time()
    while keep_training:
        for batch in train_loader:
            metrics["data_time"].update(time.time() - end)
            batch["slide"] = batch["slide"].to(device)

            model.train()
            optimizer.zero_grad()

            scores = compute_loss(config, model, batch, device)
            loss, acc = scores["loss"], scores["accuracy"]

            metrics["train_losses"].update(loss.item())
            metrics["train_accs"].update(acc)

            loss.backward()
            optimizer.step()
            metrics["batch_time"].update(time.time() - end)
            end = time.time()

            del acc
            del loss
            del batch
            if i_episode % config["eval_steps"] == 0:
                val_loss, val_acc = test(config, model, device, val_loader,
                                         metrics)
                scheduler.step(val_loss)

                metrics["between_eval_time"].update(time.time() -
                                                    between_eval_end)

                # Our optimizer has only one parameter group so the first
                # element of our list is our learning rate.
                lr = optimizer.param_groups[0]['lr']
                logger.log(
                    "Episode {0}\n"
                    "Time {metrics[between_eval_time].val:.3f} (data {metrics[data_time].val:.3f} batch {metrics[batch_time].val:.3f}) "
                    "Train loss {metrics[train_losses].val:.4e} ({metrics[train_losses].avg:.4e}) "
                    "Train acc {metrics[train_accs].val:.4f} ({metrics[train_accs].avg:.4f}) "
                    "Learning rate {lr:.2e}\n"
                    "Val time {metrics[val_time].val:.3f} (data {metrics[val_data_time].avg:.3f} batch {metrics[val_batch_time].avg:.3f}) "
                    "Val loss {metrics[val_losses].val:.4e} ({metrics[val_losses].avg:.4e}) "
                    "Val acc {metrics[val_accs].val:.4f} ({metrics[val_accs].avg:.4f}) "
                    .format(i_episode, lr=lr, metrics=metrics))

                save_metrics(metrics, exp_dir)

                is_best = val_loss < best_val_loss
                best_val_loss = val_loss if is_best else best_val_loss
                save_checkpoint(
                    {
                        "epoch": i_episode,
                        "model_name": config["model_name"],
                        "model_args": config["model_args"],
                        "state_dict": model.state_dict(),
                        "best_val_loss": best_val_loss,
                        "optimizer": optimizer.state_dict(),
                        "scheduler": scheduler.state_dict()
                    },
                    is_best,
                    path=exp_dir)
                end = time.time()
                between_eval_end = time.time()
                del val_loss
                del val_acc

            if i_episode >= config["num_episodes"]:
                keep_training = False
                break

            i_episode += 1
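
save_checkpoint here takes a path keyword rather than a filename. A common implementation of this pattern, mirroring the PyTorch ImageNet example (an assumption, not this project's exact helper):

import os
import shutil
import torch

def save_checkpoint(state, is_best, path):
    filename = os.path.join(path, "checkpoint.pth.tar")
    torch.save(state, filename)
    if is_best:
        # keep a separate copy of the best model so later saves don't clobber it
        shutil.copyfile(filename, os.path.join(path, "model_best.pth.tar"))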
Example #14
def train(train_loader, model, criterion, optimizer, epoch, args, writer):
    losses = AverageMeter("Loss", ":.3f")
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    meters = [losses, top1, top5]
    progress = ProgressMeter(
        len(train_loader),
        meters,
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    batch_size = train_loader.batch_size
    num_batches = len(train_loader)
    image0, target0 = None, None
    for i, (images, target) in tqdm.tqdm(enumerate(train_loader),
                                         ascii=True,
                                         total=len(train_loader)):
        image0 = images
        target0 = target

        if args.gpu is not None:
            image0 = image0.cuda(args.gpu, non_blocking=True)

        target0 = target0.cuda(args.gpu, non_blocking=True)
        l = 0
        a1 = 0
        a5 = 0
        for j in range(args.K):
            output = model(image0)
            loss = criterion(output, target0)
            acc1, acc5 = accuracy(output, target0, topk=(1, 5))
            l = l + loss
            a1 = a1 + acc1
            a5 = a5 + acc5
        l = l / args.K
        a1 = a1 / args.K
        a5 = a5 / args.K
        # measure accuracy and record loss
        losses.update(l.item(), image0.size(0))
        top1.update(a1.item(), image0.size(0))
        top5.update(a5.item(), image0.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        if args.conv_type != "SFESubnetConv":
            l.backward()
        else:
            updateScoreDiff(model, l)
        optimizer.step()

        if i % args.print_freq == 0:
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)
            progress.write_to_tensorboard(writer,
                                          prefix="train",
                                          global_step=t)

    return top1.avg, top5.avg
Example #15
File: main.py Project: zj15001/STR
def main_worker(args):
    args.gpu = None

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)
    model = set_gpu(args, model)

    # Set up directories
    run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)

    # Loading pretrained model
    if args.pretrained:
        pretrained(args, model)

        # Saving a DenseConv (nn.Conv2d) compatible model
        if args.dense_conv_model:
            print(
                f"==> DenseConv compatible model, saving at {ckpt_base_dir / 'model_best.pth'}"
            )
            save_checkpoint(
                {
                    "epoch": 0,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                },
                True,
                filename=ckpt_base_dir / f"epoch_pretrained.state",
                save=True,
            )
            return

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        best_acc1 = resume(args, model, optimizer)

    # Evaluation of the model
    if args.evaluate:
        acc1, acc5 = validate(data.val_loader,
                              model,
                              criterion,
                              args,
                              writer=None,
                              epoch=args.start_epoch)
        return

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(1,
                                     [epoch_time, validation_time, train_time],
                                     prefix="Overall Timing")

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    save_checkpoint(
        {
            "epoch": 0,
            "arch": args.arch,
            "state_dict": model.state_dict(),
            "best_acc1": best_acc1,
            "best_acc5": best_acc5,
            "best_train_acc1": best_train_acc1,
            "best_train_acc5": best_train_acc5,
            "optimizer": optimizer.state_dict(),
            "curr_acc1": acc1 if acc1 else "Not evaluated",
        },
        False,
        filename=ckpt_base_dir / f"initial.state",
        save=False,
    )

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        lr_policy(epoch, iteration=None)
        cur_lr = get_lr(optimizer)

        # Gradual pruning in GMP experiments
        if args.conv_type == "GMPConv" and epoch >= args.init_prune_epoch and epoch <= args.final_prune_epoch:
            total_prune_epochs = args.final_prune_epoch - args.init_prune_epoch + 1
            for n, m in model.named_modules():
                if hasattr(m, 'set_curr_prune_rate'):
                    prune_decay = (
                        1 - ((args.curr_prune_epoch - args.init_prune_epoch) /
                             total_prune_epochs))**3
                    curr_prune_rate = m.prune_rate - (m.prune_rate *
                                                      prune_decay)
                    m.set_curr_prune_rate(curr_prune_rate)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(data.train_loader,
                                       model,
                                       criterion,
                                       optimizer,
                                       epoch,
                                       args,
                                       writer=writer)
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()
        acc1, acc5 = validate(data.val_loader, model, criterion, args, writer,
                              epoch)
        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = ((epoch % args.save_every) == 0) and args.save_every > 0
        if is_best or save or epoch == args.epochs - 1:
            if is_best:
                print(
                    f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}"
                )

            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "best_acc5": best_acc5,
                    "best_train_acc1": best_train_acc1,
                    "best_train_acc5": best_train_acc5,
                    "optimizer": optimizer.state_dict(),
                    "curr_acc1": acc1,
                    "curr_acc5": acc5,
                },
                is_best,
                filename=ckpt_base_dir / f"epoch_{epoch}.state",
                save=save,
            )

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer,
                                              prefix="diagnostics",
                                              global_step=epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

        # Storing sparsity and threshold statistics for STRConv models
        if args.conv_type == "STRConv":
            count = 0
            sum_sparse = 0.0
            for n, m in model.named_modules():
                if isinstance(m, STRConv):
                    sparsity, total_params, thresh = m.getSparsity()
                    writer.add_scalar("sparsity/{}".format(n), sparsity, epoch)
                    writer.add_scalar("thresh/{}".format(n), thresh, epoch)
                    sum_sparse += int(((100 - sparsity) / 100) * total_params)
                    count += total_params
            total_sparsity = 100 - (100 * sum_sparse / count)
            writer.add_scalar("sparsity/total", total_sparsity, epoch)
        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

    write_result_to_csv(
        best_acc1=best_acc1,
        best_acc5=best_acc5,
        best_train_acc1=best_train_acc1,
        best_train_acc5=best_train_acc5,
        prune_rate=args.prune_rate,
        curr_acc1=acc1,
        curr_acc5=acc5,
        base_config=args.config,
        name=args.name,
    )
    if args.conv_type == "STRConv":
        json_data = {}
        json_thres = {}
        sum_sparse = 0.0
        count = 0
        for n, m in model.named_modules():
            if isinstance(m, STRConv):
                sparsity = m.getSparsity()
                json_data[n] = sparsity[0]
                sum_sparse += int(((100 - sparsity[0]) / 100) * sparsity[1])
                count += sparsity[1]
                json_thres[n] = sparsity[2]
        json_data["total"] = 100 - (100 * sum_sparse / count)
        if not os.path.exists("runs/layerwise_sparsity"):
            os.mkdir("runs/layerwise_sparsity")
        if not os.path.exists("runs/layerwise_threshold"):
            os.mkdir("runs/layerwise_threshold")
        with open("runs/layerwise_sparsity/{}.json".format(args.name),
                  "w") as f:
            json.dump(json_data, f)
        with open("runs/layerwise_threshold/{}.json".format(args.name),
                  "w") as f:
            json.dump(json_thres, f)
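
The total-sparsity aggregation above is easy to sanity-check in isolation. A minimal sketch with hypothetical layer statistics (names and numbers are made up):

layers = {"conv1": (90.0, 1000), "conv2": (50.0, 3000)}  # hypothetical (sparsity %, params)

sum_nonzero, count = 0, 0
for name, (sparsity, total_params) in layers.items():
    # surviving (nonzero) weights in this layer
    sum_nonzero += int(((100 - sparsity) / 100) * total_params)
    count += total_params

total_sparsity = 100 - (100 * sum_nonzero / count)
print(total_sparsity)  # 100 - 100 * 1600 / 4000 = 60.0

Example #16
0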
def train(logging_start_epoch, epoch, data, model, criterion, optimizer):
    """Main training procedure.
    
    Arguments:
        logging_start_epoch -- number of the first epoch to be logged
        epoch -- current epoch 
        data -- DataLoader which can provide batches for an epoch
        model -- model to be trained
        criterion -- instance of loss function to be optimized
        optimizer -- instance of optimizer which will be used for parameter updates
    """
    text_logger = logging.getLogger(__name__)

    model.train()

    # initialize counters, etc.
    learning_rate = optimizer.param_groups[0]['lr']
    cla = 0
    done, start_time = 0, time.time()

    total_loss = AverageMeter('Total Loss', ':.4e')
    mel_pre_loss = AverageMeter('Mel Pre Loss', ':.4e')
    mel_post_loss = AverageMeter('Mel Post Loss', ':.4e')
    lang_class_acc = AverageMeter('Lang Class Acc', ':.4e')
    progress = ProgressMeter(len(data),
                             total_loss,
                             mel_pre_loss,
                             mel_post_loss,
                             lang_class_acc,
                             prefix="Epoch: [{}]".format(epoch),
                             logger=text_logger)

    # loop through epoch batches
    for i, batch in enumerate(data):

        global_step = done + epoch * len(data)
        optimizer.zero_grad()

        # parse batch
        batch = list(map(to_gpu, batch))
        src, src_len, trg_mel, trg_lin, trg_len, stop_trg, spkrs, langs = batch

        # get teacher forcing ratio
        if hp.constant_teacher_forcing:
            tf = hp.teacher_forcing
        else:
            tf = cos_decay(
                max(global_step - hp.teacher_forcing_start_steps, 0),
                hp.teacher_forcing_steps)

        # run the model
        post_pred, pre_pred, stop_pred, alignment, spkrs_pred, enc_output = model(
            src, src_len, trg_mel, trg_len, spkrs, langs, tf)

        # evaluate loss function
        post_trg = trg_lin if hp.predict_linear else trg_mel
        classifier = model._reversal_classifier if hp.reversal_classifier else None
        loss, batch_losses = criterion(src_len, trg_len, pre_pred, trg_mel,
                                       post_pred, post_trg, stop_pred,
                                       stop_trg, alignment, spkrs, spkrs_pred,
                                       enc_output, classifier)

        total_loss.update(loss, src.size(0))
        mel_pre_loss.update(batch_losses['mel_pre'], src.size(0))
        mel_post_loss.update(batch_losses['mel_pos'], src.size(0))

        # evaluate adversarial classifier accuracy, if present
        if hp.reversal_classifier:
            input_mask = lengths_to_mask(src_len)
            trg_spkrs = torch.zeros_like(input_mask, dtype=torch.int64)
            for s in range(hp.speaker_number):
                speaker_mask = (spkrs == s)
                trg_spkrs[speaker_mask] = s
            matches = (trg_spkrs == torch.argmax(torch.nn.functional.softmax(
                spkrs_pred, dim=-1),
                                                 dim=-1))
            matches[~input_mask] = False
            cla = torch.sum(matches).item() / torch.sum(input_mask).item()
            lang_class_acc.update(cla, src.size(0))

        # compute gradients and make a step
        loss.backward()
        gradient = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                  hp.gradient_clipping)
        optimizer.step()

        # log training progress
        if epoch >= logging_start_epoch:
            Logger.training(global_step, batch_losses, gradient, learning_rate,
                            time.time() - start_time, cla)
            progress.print(i)

        # update criterion states (params and decay of the loss and so on ...)
        criterion.update_states()

        start_time = time.time()
        done += 1
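
The teacher-forcing ratio above relies on a `cos_decay` helper that is not shown. A minimal sketch consistent with its call signature, assuming a cosine ramp from 1.0 down to 0.0 over `total_steps` (the project's actual implementation may differ):

import math

def cos_decay(step, total_steps):
    # clamp so the ratio stays at 0.0 once the schedule is exhausted
    step = min(step, total_steps)
    return 0.5 * (1.0 + math.cos(math.pi * step / total_steps))

print(cos_decay(0, 1000))     # 1.0  -> full teacher forcing
print(cos_decay(500, 1000))   # ~0.5
print(cos_decay(1000, 1000))  # ~0.0 -> free running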
Example #17
0
def train(args):

    rank = int(os.getenv("PADDLE_TRAINER_ID", 0))
    world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1))

    gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
    place = paddle.CUDAPlace(gpu_id)

    RELATED_FLAGS_SETTING = {}
    if args.seed == 0:
        RELATED_FLAGS_SETTING['FLAGS_cudnn_deterministic'] = 1
        RELATED_FLAGS_SETTING['FLAGS_benchmark'] = 1
        args.num_workers = 0
    else:
        # args.seed == None or args.seed != 0
        RELATED_FLAGS_SETTING['FLAGS_cudnn_exhaustive_search'] = 1
        RELATED_FLAGS_SETTING['FLAGS_cudnn_batchnorm_spatial_persistent'] = 1
        RELATED_FLAGS_SETTING['FLAGS_max_inplace_grad_add'] = 8
    paddle.fluid.set_flags(RELATED_FLAGS_SETTING)

    if args.seed is not None:
        args.seed = args.seed + rank
        paddle.seed(args.seed)
        np.random.seed(args.seed)
        random.seed(args.seed)

    if world_size > 1:
        import paddle.distributed.fleet as fleet
        strategy = fleet.DistributedStrategy()
        strategy.without_graph_optimization = True
        fleet.init(is_collective=True, strategy=strategy)

    if args.use_synthetic_dataset:
        trainset = datasets.SyntheticDataset(args.num_classes, fp16=args.fp16)
    else:
        trainset = eval("datasets.{}".format(args.dataset_type))(
            root_dir=args.data_dir,
            label_file=args.label_file,
            rank=rank,
            world_size=world_size,
            fp16=args.fp16,
            is_bin=args.is_bin)

    num_image = trainset.total_num_samples
    total_batch_size = args.batch_size * world_size
    steps_per_epoch = num_image // total_batch_size
    if args.train_unit == 'epoch':
        warmup_steps = steps_per_epoch * args.warmup_num
        total_steps = steps_per_epoch * args.train_num
        decay_steps = [x * steps_per_epoch for x in args.decay_boundaries]
        total_epoch = args.train_num
    else:
        warmup_steps = args.warmup_num
        total_steps = args.train_num
        decay_steps = [x for x in args.decay_boundaries]
        total_epoch = (total_steps + steps_per_epoch - 1) // steps_per_epoch

    logging.info('world_size: {}'.format(world_size))
    logging.info('total_batch_size: {}'.format(total_batch_size))
    logging.info('warmup_steps: {}'.format(warmup_steps))
    logging.info('steps_per_epoch: {}'.format(steps_per_epoch))
    logging.info('total_steps: {}'.format(total_steps))
    logging.info('total_epoch: {}'.format(total_epoch))
    logging.info('decay_steps: {}'.format(decay_steps))

    base_lr = total_batch_size * args.lr / 512
    lr_scheduler = paddle.optimizer.lr.PiecewiseDecay(
        boundaries=decay_steps,
        values=[
            base_lr * (args.lr_decay**i) for i in range(len(decay_steps) + 1)
        ])
    if warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler, warmup_steps, 0, base_lr)

    train_program = paddle.static.Program()
    test_program = paddle.static.Program()
    startup_program = paddle.static.Program()

    margin_loss_params = eval("losses.{}".format(args.loss))()
    train_model = StaticModel(
        main_program=train_program,
        startup_program=startup_program,
        backbone_class_name=args.backbone,
        embedding_size=args.embedding_size,
        classifier_class_name=args.classifier,
        num_classes=args.num_classes,
        sample_ratio=args.sample_ratio,
        lr_scheduler=lr_scheduler,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
        dropout=args.dropout,
        mode='train',
        fp16=args.fp16,
        fp16_configs={
            'init_loss_scaling': args.init_loss_scaling,
            'incr_every_n_steps': args.incr_every_n_steps,
            'decr_every_n_nan_or_inf': args.decr_every_n_nan_or_inf,
            'incr_ratio': args.incr_ratio,
            'decr_ratio': args.decr_ratio,
            'use_dynamic_loss_scaling': args.use_dynamic_loss_scaling,
            'use_pure_fp16': args.fp16,
            'custom_white_list': args.custom_white_list,
            'custom_black_list': args.custom_black_list,
        },
        margin_loss_params=margin_loss_params,
        data_format=args.data_format,
        lsc_init_from_numpy=args.lsc_init_from_numpy, )

    if rank == 0:
        with open(os.path.join(args.output, 'main_program.txt'), 'w') as f:
            f.write(str(train_program))

    if args.do_validation_while_train:
        test_model = StaticModel(
            main_program=test_program,
            startup_program=startup_program,
            backbone_class_name=args.backbone,
            embedding_size=args.embedding_size,
            dropout=args.dropout,
            mode='test',
            fp16=args.fp16,
            data_format=args.data_format, )

        callback_verification = CallBackVerification(
            args.validation_interval_step, rank, world_size, args.batch_size,
            test_program,
            list(test_model.backbone.input_dict.values()),
            list(test_model.backbone.output_dict.values()), args.val_targets,
            args.data_dir)

    callback_logging = CallBackLogging(args.log_interval_step, rank,
                                       world_size, total_steps,
                                       args.batch_size)
    checkpoint = Checkpoint(
        rank=rank,
        world_size=world_size,
        embedding_size=args.embedding_size,
        num_classes=args.num_classes,
        model_save_dir=os.path.join(args.output, args.backbone),
        checkpoint_dir=args.checkpoint_dir,
        max_num_last_checkpoint=args.max_num_last_checkpoint)

    exe = paddle.static.Executor(place)
    exe.run(startup_program)

    start_epoch = 0
    global_step = 0
    loss_avg = AverageMeter()
    if args.resume:
        extra_info = checkpoint.load(program=train_program, for_train=True)
        start_epoch = extra_info['epoch'] + 1
        lr_state = extra_info['lr_state']
        # here, last_epoch means last_step for PiecewiseDecay,
        # since we always use step style for the lr_scheduler
        global_step = lr_state['last_epoch']
        train_model.lr_scheduler.set_state_dict(lr_state)

    batch_sampler = eval("paddle.io.{}".format(args.batch_sampler))(
        dataset=trainset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)

    train_loader = paddle.io.DataLoader(
        trainset,
        feed_list=list(train_model.backbone.input_dict.values()),
        places=place,
        return_list=False,
        num_workers=args.num_workers,
        batch_sampler=batch_sampler)

    for epoch in range(start_epoch, total_epoch):
        for step, data in enumerate(train_loader):
            global_step += 1

            loss_v = exe.run(
                train_program,
                feed=data,
                fetch_list=[train_model.classifier.output_dict['loss']],
                use_program_cache=True)

            loss_avg.update(np.array(loss_v)[0], 1)
            lr_value = train_model.optimizer.get_lr()
            callback_logging(global_step, loss_avg, epoch, lr_value)
            if args.do_validation_while_train:
                best_metric = callback_verification(global_step)
                if best_metric is not None and len(best_metric) > 0:
                    for ver_dataset in best_metric:
                        checkpoint.save(
                            train_program,
                            lr_scheduler=train_model.lr_scheduler,
                            epoch=epoch,
                            for_train=True,
                            best_metric=best_metric[ver_dataset])

            train_model.lr_scheduler.step()

            if global_step >= total_steps:
                break
            sys.stdout.flush()

        checkpoint.save(
            train_program,
            lr_scheduler=train_model.lr_scheduler,
            epoch=epoch,
            for_train=True)
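
The learning-rate setup above follows the linear scaling rule (base LR proportional to total batch size, normalized by 512) with piecewise decay. Worked through with hypothetical numbers:

lr, lr_decay = 0.1, 0.1                      # hypothetical args
batch_size, world_size = 128, 4
total_batch_size = batch_size * world_size   # 512
base_lr = total_batch_size * lr / 512        # 0.1

decay_steps = [20000, 28000]
values = [base_lr * (lr_decay ** i) for i in range(len(decay_steps) + 1)]
print(values)  # ~[0.1, 0.01, 0.001]: one value per interval between boundaries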
Example #18
0
def base(model, device, val_loader, criterion, args, writer, epoch=0):
    """
        Evaluating on unmodified validation set inputs.
    """
    batch_time = AverageMeter("Time", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(len(val_loader), [batch_time, losses, top1, top5],
                             prefix="Test: ")

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, data in enumerate(val_loader):
            images, target = data[0].to(device), data[1].to(device)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i + 1) % args.print_freq == 0:
                progress.display(i)

            if writer:
                progress.write_to_tensorboard(writer, "test",
                                              epoch * len(val_loader) + i)

            # write a sample of test images to tensorboard (helpful for debugging)
            if i == 0 and writer:
                writer.add_image(
                    "test-images",
                    torchvision.utils.make_grid(images[0:len(images) // 4]),
                )
        progress.display(i)  # print final results

    return top1.avg, top5.avg
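
These examples all call an `accuracy(output, target, topk=...)` helper. A minimal sketch of the usual top-k recipe it presumably follows (the real helper may differ in details):

import torch

def accuracy(output, target, topk=(1,)):
    """Top-k accuracy in percent, the standard PyTorch recipe."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)   # [batch, maxk] indices
        pred = pred.t()                              # [maxk, batch]
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res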
Example #19
0
def validate(val_loader, model, criterion, args, writer, epoch):
    batch_time = AverageMeter("Time", ":6.3f", write_val=False)
    losses = AverageMeter("Loss", ":.3f", write_val=False)
    top1 = AverageMeter("Acc@1", ":6.2f", write_val=False)
    top5 = AverageMeter("Acc@5", ":6.2f", write_val=False)
    progress = ProgressMeter(val_loader.num_batches,
                             [batch_time, losses, top1, top5],
                             args,
                             prefix="Test: ")

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()

        # confusion_matrix = torch.zeros(args.num_cls,args.num_cls)
        for i, data in enumerate(val_loader):
            # images, target = data[0]['data'], data[0]['label'].long().squeeze()
            images, target = data[0].cuda(), data[1].long().squeeze().cuda()

            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            # print(target,torch.mean(images),acc1,acc5,loss,torch.mean(output))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1.item(), images.size(0))
            top5.update(acc5.item(), images.size(0))

            # _, preds = torch.max(output, 1)
            # for t, p in zip(target.view(-1), preds.view(-1)):
            #     confusion_matrix[t.long(), p.long()] += 1

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        progress.display(val_loader.num_batches)

        if writer is not None:
            progress.write_to_tensorboard(writer,
                                          prefix="test",
                                          global_step=epoch)

    # torch.save(confusion_matrix,'./conf_mat.pt')
    # print(top1.count)
    return top1.avg, top5.avg
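
`AverageMeter` appears in every example; a minimal sketch of its core behavior, ignoring the formatting and tensorboard options (`write_val`, `write_avg`, `tb_tag`) seen in the snippets:

class AverageMeter:
    """Track the latest value and a running average."""

    def __init__(self, name, fmt=":f"):
        self.name, self.fmt = name, fmt
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        # n is the number of samples the value was computed over
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count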
Example #20
0
def dann_train(feature_extractor: FeatureExtractor,
               domain_adv: DomainAdversarialLoss,
               src_iter: ForeverDataIterator,
               tar_iter: ForeverDataIterator,
               src_val_loader, tar_val_loader,
               args):
    optimizer = Adam(
        itertools.chain(feature_extractor.parameters(), domain_adv.parameters()),
        lr=args.lr, weight_decay=args.weight_decay
    )
    
    npair_loss = NPairsLoss()  # n pair loss

    epoch = args.epoch
    iter_per_epoch = args.iter_per_epoch
    writer = args.writer # Summary Writer
    logger = args.logger
    device = args.device
    w_da = args.w_da
    model_dir = args.model_dir

    # loss
    loss_rec = AverageMeter('tot_loss', tb_tag='Loss/tot', writer=writer)
    loss_lb_rec = AverageMeter('lb_loss', tb_tag='Loss/lb', writer=writer)
    loss_lb_g_rec = AverageMeter('lb_g_loss', tb_tag='Loss/lb_g', writer=writer)
    loss_da_rec = AverageMeter('da_loss', tb_tag='Loss/da', writer=writer)

    # acc
    da_acc_rec = AverageMeter('da_acc', tb_tag='Acc/da', writer=writer)

    n_iter = 0
    best_nmi = 0
    for e_i in range(epoch):
        feature_extractor.train()
        domain_adv.train()
        progress = ProgressMeter(
            iter_per_epoch,
            [loss_lb_g_rec, loss_lb_rec, loss_da_rec,da_acc_rec],
            prefix="Epoch: [{}]".format(e_i),
            logger=logger
        )
        for i in range(iter_per_epoch):
            x_s, l_s = next(src_iter)
            x_t, l_t = next(tar_iter)
            # for obj in [x_s, x_t, l_s, l_t]: # to device
                # obj = obj.to(device)
            
            x_s, l_s, x_t, l_t = x_s.to(device), l_s.to(device), x_t.to(device), l_t.to(device)

            x = torch.cat((x_s, x_t), dim=0)
            f, g = feature_extractor(x)
            f_s, f_t = f.chunk(2, dim=0)
            g_s, g_t = g.chunk(2, dim=0)
            
            # source only part
            loss_s = npair_loss(f_s, l_s) # get n-pair loss on source domain
            loss_s_g = npair_loss(g_s, l_s) # get n-pair loss on source domain
            loss_lb_rec.update(loss_s.item(), x_s.size(0), iter=n_iter)
            loss_lb_g_rec.update(loss_s_g.item(), x_s.size(0), iter=n_iter)
            
            # dann
            # da_loss = domain_adv(f_s,f_t)
            da_loss = domain_adv(g_s,f_t)
            domain_acc = domain_adv.domain_discriminator_accuracy
            loss_da_rec.update(da_loss.item(), f.size(0), iter=n_iter)
            da_acc_rec.update(domain_acc.item(), f.size(0), iter=n_iter)

            loss = 0.5 * (loss_s + loss_s_g) + w_da * da_loss
            # loss = loss_s
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            n_iter += 1
            if i % args.print_freq == 0:
                progress.display(i)

        if e_i % 5 == 0:
            # logger.info(f"saving embedding in epoch{e_i}")
            # # show embedding
            # show_embedding(backbone, [src_val_loader], tag=f'src_{e_i}', epoch=e_i, writer, device)
            # show_embedding(backbone, [tar_val_loader], tag=f'tar_{e_i}', epoch=e_i, writer, device)
            
            nmi = NMI_eval(feature_extractor, src_val_loader, 5, device, type='src')
            logger.info(f'test on train set nmi: {nmi}')
            nmi = NMI_eval(feature_extractor, tar_val_loader, 5, device, type='tar')
            logger.info(f'test on test set nmi: {nmi}')
            if nmi > best_nmi:
                logger.info(f"save best model to {model_dir}")
                torch.save(feature_extractor.state_dict(), os.path.join(model_dir, 'minst_best_model.pth'))
                best_nmi = nmi
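
`dann_train` draws batches with `next(...)` from `ForeverDataIterator` objects rather than iterating a DataLoader directly. A minimal sketch of such an iterator, assuming it simply restarts the loader when exhausted:

class ForeverDataIterator:
    def __init__(self, data_loader):
        self.data_loader = data_loader
        self.iter = iter(data_loader)

    def __next__(self):
        try:
            return next(self.iter)
        except StopIteration:
            self.iter = iter(self.data_loader)  # restart for a new pass
            return next(self.iter)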
Example #21
0
def train(
    model, device, train_loader, sm_loader, criterion, optimizer, epoch, args, writer
):
    num_class = 10

    sa = np.zeros((num_class, num_class - 1), dtype=np.int32)
    for i in range(sa.shape[0]):
        for j in range(sa.shape[1]):
            if j < i:
                sa[i][j] = j
            else:
                sa[i][j] = j + 1
    sa = torch.LongTensor(sa)
    batch_size = args.batch_size * 2

    schedule_start = args.schedule_start
    num_steps_per_epoch = len(train_loader)
    eps_scheduler = EpsilonScheduler(
        "linear", schedule_start,
        (schedule_start + args.schedule_length - 1) * num_steps_per_epoch,
        args.starting_epsilon, args.epsilon, num_steps_per_epoch)

    end_eps = eps_scheduler.get_eps(epoch + 1, 0)
    start_eps = eps_scheduler.get_eps(epoch, 0)

    print(
        " ->->->->->->->->->-> One epoch with CROWN-IBP ({:.6f}-{:.6f})"
        " <-<-<-<-<-<-<-<-<-<-".format(start_eps, end_eps)
    )

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    ibp_losses = AverageMeter("IBP_Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    ibp_acc1 = AverageMeter("IBP1", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, ibp_losses, top1, ibp_acc1],
        prefix="Epoch: [{}]".format(epoch),
    )

    model = BoundSequential.convert(model,\
                    {'same-slope': False, 'zero-lb': False,\
                    'one-lb': False}).to(device)

    model.train()
    end = time.time()

    dataloader = train_loader if sm_loader is None else zip(train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        if sm_loader:
            images, target = (
                torch.cat([d[0] for d in data], 0).to(device),
                torch.cat([d[1] for d in data], 0).to(device),
            )
        else:
            images, target = data[0].to(device), data[1].to(device)

        # basic properties of training data
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(f"Training images range: {[torch.min(images), torch.max(images)]}")

        output = model(images, method_opt="forward")
        ce = nn.CrossEntropyLoss()(output, target)

        eps = eps_scheduler.get_eps(epoch, i) 
        # generate specifications
        c = torch.eye(num_class).type_as(images)[target].unsqueeze(1) -\
                torch.eye(num_class).type_as(images).unsqueeze(0) 
        # remove specifications to self
        I = (~(target.unsqueeze(1) ==\
            torch.arange(num_class).to(device).type_as(target).unsqueeze(0)))
        c = (c[I].view(images.size(0),num_class-1,num_class)).to(device)
        # scatter matrix to avoid compute margin to self
        sa_labels = sa[target].to(device)
        # storing computed lower bounds after scatter
        lb_s = torch.zeros(images.size(0), num_class).to(device)
        ub_s = torch.zeros(images.size(0), num_class).to(device)

        data_ub = torch.min(images + eps, images.max()).to(device)
        data_lb = torch.max(images - eps, images.min()).to(device)

        ub, ilb, relu_activity, unstable, dead, alive =\
                model(norm=np.inf, x_U=data_ub, x_L=data_lb,\
                eps=eps, C=c, method_opt="interval_range")

        crown_final_beta = 0.
        beta = (args.epsilon - eps * (1.0 - crown_final_beta)) / args.epsilon

        if beta < 1e-5:
            # print("pure naive")
            lb = ilb
        else:
            # print("crown-ibp")
            # get the CROWN bound using interval bounds 
            _, _, clb, bias = model(norm=np.inf, x_U=data_ub,\
                        x_L=data_lb, eps=eps, C=c,\
                        method_opt="backward_range")
            # how much better is crown-ibp than ibp?
            # diff = (clb - ilb).sum().item()
            lb = clb * beta + ilb * (1 - beta)

        lb = lb_s.scatter(1, sa_labels, lb)
        robust_ce = criterion(-lb, target)

        #print(ce, robust_ce)
        racc = accuracy(-lb, target, topk=(1,))

        loss = robust_ce

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        top1.update(acc1[0].item(), images.size(0))
        losses.update(ce.item(), images.size(0))
        ibp_losses.update(robust_ce.item(), images.size(0))
        ibp_acc1.update(racc[0].item(), images.size(0))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(
                writer, "train", epoch * len(train_loader) + i
            )

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0 : len(images) // 4]),
            )
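
The `EpsilonScheduler("linear", ...)` above ramps the perturbation budget linearly between a start and end step. A hypothetical stand-alone sketch of that schedule (the real class also handles per-batch interpolation):

def linear_eps(step, schedule_start, schedule_end, init_eps, final_eps):
    if step < schedule_start:
        return init_eps
    if step >= schedule_end:
        return final_eps
    frac = (step - schedule_start) / (schedule_end - schedule_start)
    return init_eps + frac * (final_eps - init_eps)

print(linear_eps(0, 100, 500, 0.0, 0.3))    # 0.0
print(linear_eps(300, 100, 500, 0.0, 0.3))  # 0.15
print(linear_eps(900, 100, 500, 0.0, 0.3))  # 0.3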
Example #22
0
def main_worker(args):
    # NEW: equivalent to MPI init.
    print("world size ", os.environ['OMPI_COMM_WORLD_SIZE'])
    print("rank ", os.environ['OMPI_COMM_WORLD_RANK'])
    torch.distributed.init_process_group(
        backend="nccl",
        init_method="env://",
        world_size=int(os.environ['OMPI_COMM_WORLD_SIZE']),
        rank=int(os.environ['OMPI_COMM_WORLD_RANK']))

    # NEW: lookup number of ranks in the job, and our rank
    args.world_size = torch.distributed.get_world_size()
    print("world size ", args.world_size)
    args.rank = torch.distributed.get_rank()
    print("rank ", args.rank)
    ngpus_per_node = torch.cuda.device_count()
    print("ngpus_per_node ", ngpus_per_node)
    local_rank = args.rank % ngpus_per_node
    print("local_rank ", local_rank)

    # NEW: Globalize variables
    global best_acc1
    global best_acc5
    global best_train_acc1
    global best_train_acc5

    #args.gpu = None
    # NEW: Specify gpu
    args.gpu = local_rank
    train, validate, modifier = get_trainer(args)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)

    # NEW: Distributed data
    #if args.distributed:
    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)

    #model = set_gpu(args, model)
    # NEW: Modified function for loading gpus on multinode setups
    model = lassen_set_gpu(args, model)

    if args.pretrained:
        pretrained(args, model)

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        #criterion = nn.CrossEntropyLoss().cuda()
        # NEW: Specify gpu
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        best_acc1 = resume(args, model, optimizer)

    # Data loading code
    if args.evaluate:
        acc1, acc5 = validate(data.val_loader,
                              model,
                              criterion,
                              args,
                              writer=None,
                              epoch=args.start_epoch)

        return

    # Set up directories
    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)
        args.ckpt_base_dir = ckpt_base_dir

    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        writer = SummaryWriter(log_dir=log_base_dir)
    else:
        writer = None

    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)

    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        progress_overall = ProgressMeter(
            1, [epoch_time, validation_time, train_time],
            prefix="Overall Timing")

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        save_checkpoint(
            {
                "epoch": 0,
                "arch": args.arch,
                "state_dict": model.state_dict(),
                "best_acc1": best_acc1,
                "best_acc5": best_acc5,
                "best_train_acc1": best_train_acc1,
                "best_train_acc5": best_train_acc5,
                "optimizer": optimizer.state_dict(),
                "curr_acc1": acc1 if acc1 else "Not evaluated",
            },
            False,
            filename=ckpt_base_dir / f"initial.state",
            save=False,
        )

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        # NEW: Distributed data
        #if args.distributed:
        data.train_sampler.set_epoch(epoch)
        data.val_sampler.set_epoch(epoch)

        lr_policy(epoch, iteration=None)
        #modifier(args, epoch, model)

        cur_lr = get_lr(optimizer)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(data.train_loader,
                                       model,
                                       criterion,
                                       optimizer,
                                       epoch,
                                       args,
                                       writer=writer)
        #train_acc1, train_acc5 = train(
        #    data.train_loader, model, criterion, optimizer, epoch, args, writer=None
        #)
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()

        # NEW: Only write values to tensorboard for main processor (one with global rank 0)
        if args.rank == 0:
            acc1, acc5 = validate(data.val_loader, model, criterion, args,
                                  writer, epoch)
        else:
            acc1, acc5 = validate(data.val_loader, model, criterion, args,
                                  None, epoch)

        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = args.save_every > 0 and (epoch % args.save_every) == 0

        # NEW: Only do for main processor (one with global rank 0)
        if args.rank == 0:
            if is_best or save or epoch == args.epochs - 1:
                if is_best:
                    print(
                        f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}"
                    )

                save_checkpoint(
                    {
                        "epoch": epoch + 1,
                        "arch": args.arch,
                        "state_dict": model.state_dict(),
                        "best_acc1": best_acc1,
                        "best_acc5": best_acc5,
                        "best_train_acc1": best_train_acc1,
                        "best_train_acc5": best_train_acc5,
                        "optimizer": optimizer.state_dict(),
                        "curr_acc1": acc1,
                        "curr_acc5": acc5,
                    },
                    is_best,
                    filename=ckpt_base_dir / f"epoch_most_recent.state",
                    save=save,
                )
                #filename=ckpt_base_dir / f"epoch_{epoch}.state",

        epoch_time.update((time.time() - end_epoch) / 60)

        # NEW: Only do for main processor (one with global rank 0)
        if args.rank == 0:
            progress_overall.display(epoch)
            progress_overall.write_to_tensorboard(writer,
                                                  prefix="diagnostics",
                                                  global_step=epoch)

            if args.conv_type == "SampleSubnetConv":
                count = 0
                sum_pr = 0.0
                for n, m in model.named_modules():
                    if isinstance(m, SampleSubnetConv):
                        # avg pr across 10 samples
                        pr = 0.0
                        for _ in range(10):
                            pr += ((torch.rand_like(m.clamped_scores) >=
                                    m.clamped_scores).float().mean().item())
                        pr /= 10.0
                        writer.add_scalar("pr/{}".format(n), pr, epoch)
                        sum_pr += pr
                        count += 1

                args.prune_rate = sum_pr / count
                writer.add_scalar("pr/average", args.prune_rate, epoch)

        # NEW: Only do for main processor (one with global rank 0)
        if args.rank == 0:
            writer.add_scalar("test/lr", cur_lr, epoch)

        end_epoch = time.time()

    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        write_result_to_csv(
            best_acc1=best_acc1,
            best_acc5=best_acc5,
            best_train_acc1=best_train_acc1,
            best_train_acc5=best_train_acc5,
            prune_rate=args.prune_rate,
            curr_acc1=acc1,
            curr_acc5=acc5,
            base_config=args.config,
            name=args.name,
        )
Example #23
0
def train(train_loader, model, criterion, optimizer, epoch, args, writer):
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    batch_size = train_loader.batch_size
    num_batches = len(train_loader)
    end = time.time()
    for i, (images, target) in tqdm.tqdm(enumerate(train_loader),
                                         ascii=True,
                                         total=len(train_loader)):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)

        target = target.cuda(args.gpu, non_blocking=True).long()

        # compute output
        output = model(images)

        loss = criterion(output, target.view(-1))

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1.item(), images.size(0))
        top5.update(acc5.item(), images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)
            progress.write_to_tensorboard(writer,
                                          prefix="train",
                                          global_step=t)

    return top1.avg, top5.avg
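
`ProgressMeter` formats the meters into the per-batch console lines seen above. A minimal sketch, paired with the meter sketch earlier (anything exposing `name`, `val`, and `avg` works):

class ProgressMeter:
    def __init__(self, num_batches, meters, prefix=""):
        self.num_batches = num_batches
        self.meters = meters      # objects exposing .name, .val, .avg
        self.prefix = prefix

    def display(self, batch):
        entries = [f"{self.prefix}[{batch}/{self.num_batches}]"]
        entries += [f"{m.name} {m.val:.3f} ({m.avg:.3f})" for m in self.meters]
        print("\t".join(entries))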
Example #24
0
def train(train_loader, model, criterion, optimizer, epoch, cfg, writer):
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    progress = ProgressMeter(
        train_loader.num_batches,
        [batch_time, data_time, losses],
        cfg,
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    # batch_size = train_loader.batch_size
    num_batches = train_loader.num_batches
    end = time.time()
    batch_size = train_loader.batch_size
    for i, data in enumerate(train_loader):
        imgs1 = data[0][0].cuda(non_blocking=True)
        imgs2 = data[0][1].cuda(non_blocking=True)
        target = data[1].long().squeeze().cuda(non_blocking=True)
        # measure data loading time
        data_time.update(time.time() - end)

        # compute output
        emb1, emb2 = model(imgs1, imgs2)
        loss = criterion(emb1, emb2)

        losses.update(loss.item(), batch_size)
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % cfg.print_freq == 0 or i == num_batches - 1:
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)
            progress.write_to_tensorboard(writer,
                                          prefix="train",
                                          global_step=t)
Example #25
0
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch,
          args, writer):
    epsilon = set_epsilon(args, epoch)
    k = args.mixtraink
    alpha = 0.8
    iw = set_interval_weight(args, epoch)

    print(" ->->->->->->->->->-> One epoch with MixTrain{} (SYM {:.3f})"
          " <-<-<-<-<-<-<-<-<-<-".format(k, epsilon))

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    sym_losses = AverageMeter("Sym_Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    sym1 = AverageMeter("Sym1", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, sym_losses, top1, sym1],
        prefix="Epoch: [{}]".format(epoch),
    )

    model.train()
    end = time.time()

    dataloader = train_loader if sm_loader is None else zip(
        train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        if sm_loader:
            images, target = (
                torch.cat([d[0] for d in data], 0).to(device),
                torch.cat([d[1] for d in data], 0).to(device),
            )
        else:
            images, target = data[0].to(device), data[1].to(device)

        # basic properties of training data
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(
                f"Training images range: {[torch.min(images), torch.max(images)]}"
            )

        output = model(images)
        ce = nn.CrossEntropyLoss()(output, target)

        if (np.random.uniform() <= alpha):
            r = np.random.randint(low=0, high=images.shape[0], size=k)
            rce, rerr = sym_interval_analyze(
                model,
                epsilon,
                images[r],
                target[r],
                use_cuda=torch.cuda.is_available(),
                parallel=False)

            #print("sym:", rce.item(), ce.item())
            loss = iw * rce + ce
            sym_losses.update(rce.item(), k)
            sym1.update((1 - rerr) * 100., images.size(0))
        else:
            loss = ce

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        top1.update(acc1[0], images.size(0))
        losses.update(ce.item(), images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(writer, "train",
                                          epoch * len(train_loader) + i)

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0:len(images) // 4]),
            )
Example #26
0
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch,
          args, writer):
    print(
        " ->->->->->->->->->-> One epoch with Natural training <-<-<-<-<-<-<-<-<-<-"
    )

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch),
    )

    model.train()
    end = time.time()

    dataloader = train_loader if sm_loader is None else zip(
        train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        if sm_loader:
            images, target = (
                torch.cat([d[0] for d in data], 0).to(device),
                torch.cat([d[1] for d in data], 0).to(device),
            )
        else:
            images, target = data[0].to(device), data[1].to(device)

        # basic properties of training
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print("Pixel range for training images : [{}, {}]".format(
                torch.min(images).data.cpu().numpy(),
                torch.max(images).data.cpu().numpy(),
            ))

        # stability-loss
        if args.dataset == "imagenet":
            std = (torch.tensor(
                [0.229, 0.224,
                 0.225]).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)).to(device)
            noise = (torch.randn_like(images) /
                     std).to(device) * args.noise_std
            output = model(images + noise)
            loss = nn.CrossEntropyLoss()(output, target)
        else:
            output = model(images)
            loss_natural = nn.CrossEntropyLoss()(output, target)
            loss_robust = (1.0 / len(images)) * nn.KLDivLoss(
                size_average=False)(
                    F.log_softmax(
                        model(images + torch.randn_like(images).to(device) *
                              args.noise_std),
                        dim=1,
                    ),
                    F.softmax(output, dim=1),
                )
            loss = loss_natural + args.beta * loss_robust

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(writer, "train",
                                          epoch * len(train_loader) + i)

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0:len(images) // 4]),
            )
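
The non-imagenet branch above is a stability objective: natural cross-entropy plus a KL term that pins predictions on Gaussian-noised inputs to the clean predictions. Restated compactly as a sketch (`reduction="batchmean"` matches the `size_average=False` sum divided by the batch size):

import torch
import torch.nn as nn
import torch.nn.functional as F

def stability_loss(model, images, target, noise_std, beta):
    output = model(images)
    loss_natural = nn.CrossEntropyLoss()(output, target)
    noisy = model(images + torch.randn_like(images) * noise_std)
    # KL(noisy || clean); batchmean == sum / batch_size
    loss_robust = F.kl_div(F.log_softmax(noisy, dim=1),
                           F.softmax(output, dim=1),
                           reduction="batchmean")
    return loss_natural + beta * loss_robust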
Example #27
0
def trn(cfg, model):
    cfg.logger.info(cfg)
    if cfg.seed is not None:
        random.seed(cfg.seed)
        torch.manual_seed(cfg.seed)
        torch.cuda.manual_seed(cfg.seed)
        torch.cuda.manual_seed_all(cfg.seed)

    train, validate_knn = get_trainer(cfg)

    if cfg.gpu is not None:
        cfg.logger.info("Use GPU: {} for training".format(cfg.gpu))

    # if cfg.pretrained:
    #     net_utils.load_pretrained(cfg.pretrained,cfg.multigpu[0], model)

    optimizer = get_optimizer(cfg, model)
    cfg.logger.info(f"=> Getting {cfg.set} dataset")

    dataset = getattr(data, cfg.set)(cfg)

    lr_policy = get_policy(cfg.lr_policy)(optimizer, cfg)

    if cfg.arch == 'SimSiam':
        # L = D(p1, z2) / 2 + D(p2, z1) / 2
        base_criterion = lambda bb1_z1_p1_emb, bb2_z2_p2_emb: simsiam.SimSaimLoss(bb1_z1_p1_emb[2], bb2_z2_p2_emb[1]) / 2 +\
                                                 simsiam.SimSaimLoss(bb2_z2_p2_emb[2], bb1_z1_p1_emb[1]) / 2
    elif cfg.arch == 'SimCLR':
        base_criterion = lambda z1,z2 : simclr.NT_XentLoss(z1, z2)
    else:
        raise NotImplementedError

    run_base_dir, ckpt_base_dir, log_base_dir = path_utils.get_directories(cfg,cfg.gpu)
    _, zero_gpu_ckpt_base_dir, _ = path_utils.get_directories(cfg, 0)
    # if cfg.resume:
    saved_epochs = sorted(glob.glob(str(zero_gpu_ckpt_base_dir) + '/epoch_*.state'), key=os.path.getmtime)
    # assert len(epochs) < 2, 'Should be only one saved epoch -- the last one'
    if len(saved_epochs) > 0:
        cfg.resume = saved_epochs[-1]
        resume(cfg, model, optimizer)


    cfg.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(
        1, [epoch_time, validation_time, train_time], cfg, prefix="Overall Timing"
    )

    end_epoch = time.time()
    cfg.start_epoch = cfg.start_epoch or 0

    start_time = time.time()
    gpu_info = gpu_utils.GPU_Utils(gpu_index=cfg.gpu)

    cfg.logger.info('Start Training: Model conv 1 initialization {}'.format(torch.sum(model.module.backbone.conv1.weight)))
    # Start training

    for n,m in model.module.named_modules():
        if hasattr(m, "weight") and m.weight is not None:
            cfg.logger.info('{} ({}): {}'.format(n,type(m).__name__,m.weight.shape))


    criterion = base_criterion
    cfg.logger.info('Using Vanilla Criterion')
    for epoch in range(cfg.start_epoch, cfg.epochs):
        if cfg.world_size > 1:
            dataset.sampler.set_epoch(epoch)

        lr_policy(epoch, iteration=None)

        cur_lr = net_utils.get_lr(optimizer)

        start_train = time.time()
        train(dataset.trn_loader, model,criterion, optimizer, epoch, cfg, writer=writer)
        train_time.update((time.time() - start_train) / 60)


        if (epoch + 1) % cfg.test_interval == 0:
            if cfg.gpu == cfg.base_gpu:
                # evaluate on validation set
                start_validation = time.time()

                acc = validate_knn(dataset.trn_loader, dataset.val_loader, model.module, cfg, writer, epoch)

                validation_time.update((time.time() - start_validation) / 60)
                csv_utils.write_generic_result_to_csv(path=cfg.exp_dir,name=os.path.basename(cfg.exp_dir[:-1]),
                                                      epoch=epoch,
                                                      knn_acc=acc)

                save = cfg.save_every > 0 and ((epoch + 1) % cfg.save_every) == 0
                if save or epoch == cfg.epochs - 1:
                    # if is_best:
                    # print(f"==> best {last_val_acc1:.02f} saving at {ckpt_base_dir / 'model_best.pth'}")

                    net_utils.save_checkpoint(
                        {
                            "epoch": epoch + 1,
                            "arch": cfg.arch,
                            "state_dict": model.state_dict(),
                            "ACC": acc,
                            "optimizer": optimizer.state_dict(),
                        },
                        is_best=False,
                        filename=ckpt_base_dir / f"epoch_{epoch:04d}.state",
                        save=save or epoch == cfg.epochs - 1,
                    )


                elapsed_time = time.time() - start_time
                seconds_todo = (cfg.epochs - epoch) * (elapsed_time / cfg.test_interval)
                estimated_time_complete = timedelta(seconds=int(seconds_todo))
                start_time = time.time()
                cfg.logger.info(
                    f"==> ETA: {estimated_time_complete}\tGPU-M: {gpu_info.gpu_mem_usage()}\tGPU-U: {gpu_info.gpu_utilization()}")

                epoch_time.update((time.time() - end_epoch) / 60)
                progress_overall.display(epoch)
                progress_overall.write_to_tensorboard(
                    writer, prefix="diagnostics", global_step=epoch
                )

                writer.add_scalar("test/lr", cur_lr, epoch)
                end_epoch = time.time()


            if cfg.world_size > 1:
                # cfg.logger.info('GPU {} going into the barrier'.format(cfg.gpu))
                dist.barrier()
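
The ETA estimate above scales the wall time of the last `test_interval` epochs to the epochs remaining. In isolation, with hypothetical numbers:

from datetime import timedelta

def eta(total_epochs, epoch, elapsed_since_last_eval, test_interval):
    seconds_todo = (total_epochs - epoch) * (elapsed_since_last_eval / test_interval)
    return timedelta(seconds=int(seconds_todo))

print(eta(100, 40, 600.0, 5))  # 60 epochs at ~120 s each -> 2:00:00

Example #28
0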
def evaluate(epoch, data, model, criterion):
    """Main evaluation procedure.
    
    Arguments:
        epoch -- current epoch 
        data -- DataLoader which can provide validation batches
        model -- model to be evaluated
        criterion -- instance of loss function to measure performance
    """
    text_logger = logging.getLogger(__name__)

    model.eval()

    # initialize counters, etc.
    mcd, mcd_count = 0, 0
    cla, cla_count = 0, 0
    eval_losses = {}

    total_loss = AverageMeter('Total Loss', ':.4e')
    mel_pre_loss = AverageMeter('Mel Pre Loss', ':.4e')
    mel_post_loss = AverageMeter('Mel Post Loss', ':.4e')
    lang_class_acc = AverageMeter('Lang Class Acc', ':.4e')
    progress = ProgressMeter(len(data),
                             total_loss,
                             mel_pre_loss,
                             mel_post_loss,
                             lang_class_acc,
                             prefix="Epoch: [{}]".format(epoch),
                             logger=text_logger)

    # loop through epoch batches
    with torch.no_grad():
        for i, batch in enumerate(data):

            # parse batch
            batch = list(map(to_gpu, batch))
            src, src_len, trg_mel, trg_lin, trg_len, stop_trg, spkrs, langs = batch

            # run the model (twice, with and without teacher forcing)
            post_pred, pre_pred, stop_pred, alignment, spkrs_pred, enc_output = model(
                src, src_len, trg_mel, trg_len, spkrs, langs, 1.0)
            post_pred_0, _, stop_pred_0, alignment_0, _, _ = model(
                src, src_len, trg_mel, trg_len, spkrs, langs, 0.0)
            stop_pred_probs = torch.sigmoid(stop_pred_0)

            # evaluate loss function
            post_trg = trg_lin if hp.predict_linear else trg_mel
            classifier = model._reversal_classifier if hp.reversal_classifier else None
            loss, batch_losses = criterion(src_len, trg_len, pre_pred, trg_mel,
                                           post_pred, post_trg, stop_pred,
                                           stop_trg, alignment, spkrs,
                                           spkrs_pred, enc_output, classifier)
            total_loss.update(loss, src.size(0))
            mel_pre_loss.update(batch_losses['mel_pre'], src.size(0))
            mel_post_loss.update(batch_losses['mel_pos'], src.size(0))
            # compute mel cepstral distortion
            for j, (gen, ref, stop) in enumerate(
                    zip(post_pred_0, trg_mel, stop_pred_probs)):
                stop_idxes = np.where(stop.cpu().numpy() > 0.5)[0]
                stop_idx = min(
                    np.min(stop_idxes) + hp.stop_frames,
                    gen.size()[1]) if len(stop_idxes) > 0 else gen.size()[1]
                gen = gen[:, :stop_idx].data.cpu().numpy()
                ref = ref[:, :trg_len[j]].data.cpu().numpy()
                if hp.normalize_spectrogram:
                    gen = audio.denormalize_spectrogram(
                        gen, not hp.predict_linear)
                    ref = audio.denormalize_spectrogram(ref, True)
                if hp.predict_linear: gen = audio.linear_to_mel(gen)
                mcd = (mcd_count * mcd + audio.mel_cepstral_distorision(
                    gen, ref, 'dtw')) / (mcd_count + 1)
                mcd_count += 1

            # compute adversarial classifier accuracy
            if hp.reversal_classifier:
                input_mask = lengths_to_mask(src_len)
                trg_spkrs = torch.zeros_like(input_mask, dtype=torch.int64)
                for s in range(hp.speaker_number):
                    speaker_mask = (spkrs == s)
                    trg_spkrs[speaker_mask] = s
                matches = (trg_spkrs == torch.argmax(
                    torch.nn.functional.softmax(spkrs_pred, dim=-1), dim=-1))
                matches[~input_mask] = False
                cla = (cla_count * cla + torch.sum(matches).item() /
                       torch.sum(input_mask).item()) / (cla_count + 1)
                cla_count += 1
                lang_class_acc.update(cla, src.size(0))

            # add batch losses to epoch losses
            for k, v in batch_losses.items():
                eval_losses[k] = v + eval_losses[k] if k in eval_losses else v

    # normalize loss per batch
    for k in eval_losses.keys():
        eval_losses[k] /= len(data)

    # log evaluation
    progress.print(i)
    Logger.evaluation(epoch + 1, eval_losses, mcd, src_len, trg_len, src,
                      post_trg, post_pred, post_pred_0, stop_pred_probs,
                      stop_trg, alignment_0, cla)

    return sum(eval_losses.values())
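
Both procedures mask padded time steps with `lengths_to_mask`. A minimal sketch of that helper, assuming a boolean [batch, max_len] mask that is True on valid positions:

import torch

def lengths_to_mask(lengths, max_len=None):
    max_len = max_len or int(lengths.max())
    idx = torch.arange(max_len, device=lengths.device)
    return idx.unsqueeze(0) < lengths.unsqueeze(1)  # [batch, max_len]

print(lengths_to_mask(torch.tensor([2, 4])))
# tensor([[ True,  True, False, False],
#         [ True,  True,  True,  True]])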
Example #29
0
def trn(cfg, model):

    cfg.logger.info(cfg)
    if cfg.seed is not None:
        random.seed(cfg.seed)
        torch.manual_seed(cfg.seed)
        torch.cuda.manual_seed(cfg.seed)
        torch.cuda.manual_seed_all(cfg.seed)

    train, validate = get_trainer(cfg)

    if cfg.gpu is not None:
        cfg.logger.info("Use GPU: {} for training".format(cfg.gpu))

    linear_classifier_layer = model.module[1]
    optimizer = get_optimizer(cfg, linear_classifier_layer)
    cfg.logger.info(f"=> Getting {cfg.set} dataset")

    dataset = getattr(data, cfg.set)(cfg)

    lr_policy = get_policy(cfg.lr_policy)(optimizer, cfg)

    criterion = nn.CrossEntropyLoss().cuda()

    # optionally resume from a checkpoint
    best_val_acc1 = 0.0
    best_val_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if cfg.resume:
        best_val_acc1 = resume(cfg, model, optimizer)

    run_base_dir, ckpt_base_dir, log_base_dir = path_utils.get_directories(
        cfg, cfg.gpu)
    cfg.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(1,
                                     [epoch_time, validation_time, train_time],
                                     cfg,
                                     prefix="Overall Timing")

    end_epoch = time.time()
    cfg.start_epoch = cfg.start_epoch or 0
    last_val_acc1 = None

    start_time = time.time()
    gpu_info = gpu_utils.GPU_Utils(gpu_index=cfg.gpu)

    # Start training
    for epoch in range(cfg.start_epoch, cfg.epochs):
        cfg.logger.info('Model conv 1 {} at epoch {}'.format(
            torch.sum(model.module[0].conv1.weight),
            epoch))  # sanity check: this sum must stay constant, i.e. the backbone is never updated
        if cfg.world_size > 1:
            dataset.sampler.set_epoch(epoch)
        lr_policy(epoch, iteration=None)

        cur_lr = net_utils.get_lr(optimizer)

        start_train = time.time()
        train_acc1, train_acc5 = train(dataset.trn_loader,
                                       model,
                                       criterion,
                                       optimizer,
                                       epoch,
                                       cfg,
                                       writer=writer)
        train_time.update((time.time() - start_train) / 60)

        if (epoch + 1) % cfg.test_interval == 0:
            if cfg.gpu == cfg.base_gpu:
                # evaluate on validation set
                start_validation = time.time()
                last_val_acc1, last_val_acc5 = validate(
                    dataset.val_loader, model.module, criterion, cfg, writer,
                    epoch)
                validation_time.update((time.time() - start_validation) / 60)

                # remember best acc@1 and save checkpoint
                is_best = last_val_acc1 > best_val_acc1
                best_val_acc1 = max(last_val_acc1, best_val_acc1)
                best_val_acc5 = max(last_val_acc5, best_val_acc5)
                best_train_acc1 = max(train_acc1, best_train_acc1)
                best_train_acc5 = max(train_acc5, best_train_acc5)

                save = cfg.save_every > 0 and (epoch + 1) % cfg.save_every == 0  # test save_every first to avoid a modulo-by-zero
                if save or epoch == cfg.epochs - 1:
                    if is_best:
                        cfg.logger.info(
                            f"==> best {last_val_acc1:.02f} saving at {ckpt_base_dir / 'model_best.pth'}"
                        )

                    net_utils.save_checkpoint(
                        {
                            "epoch": epoch + 1,
                            "arch": cfg.arch,
                            "state_dict": model.state_dict(),
                            "best_acc1": best_val_acc1,
                            "best_acc5": best_val_acc5,
                            "best_train_acc1": best_train_acc1,
                            "best_train_acc5": best_train_acc5,
                            "optimizer": optimizer.state_dict(),
                            "curr_acc1": last_val_acc1,
                            "curr_acc5": last_val_acc5,
                        },
                        is_best,
                        filename=ckpt_base_dir / f"epoch_{epoch}.state",
                        save=save or epoch == cfg.epochs - 1,
                    )

                elapsed_time = time.time() - start_time
                seconds_todo = (cfg.epochs - epoch) * (elapsed_time /
                                                       cfg.test_interval)
                estimated_time_complete = timedelta(seconds=int(seconds_todo))
                start_time = time.time()
                cfg.logger.info(
                    f"==> ETA: {estimated_time_complete}\tGPU-M: {gpu_info.gpu_mem_usage()}\tGPU-U: {gpu_info.gpu_utilization()}"
                )

                epoch_time.update((time.time() - end_epoch) / 60)
                progress_overall.display(epoch)
                progress_overall.write_to_tensorboard(writer,
                                                      prefix="diagnostics",
                                                      global_step=epoch)

                writer.add_scalar("test/lr", cur_lr, epoch)
                end_epoch = time.time()

            if cfg.world_size > 1:
                dist.barrier()
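
Example #29 is a linear probe: only the classifier head (model.module[1]) is
handed to the optimizer, and the conv1 weight-sum logged each epoch is a
sanity check that the backbone underneath never moves. A minimal sketch of
that setup with a toy torch backbone (all names here are illustrative, not
from the example):

import torch
import torch.nn as nn

backbone = nn.Sequential(nn.Conv2d(3, 8, 3), nn.AdaptiveAvgPool2d(1), nn.Flatten())
head = nn.Linear(8, 10)
model = nn.Sequential(backbone, head)

for p in backbone.parameters():
    p.requires_grad = False  # gradients never reach the backbone

# the optimizer sees only the head, mirroring get_optimizer(cfg, model.module[1])
optimizer = torch.optim.SGD(head.parameters(), lr=0.1)

x, y = torch.randn(4, 3, 16, 16), torch.randint(0, 10, (4,))
before = backbone[0].weight.clone()
loss = nn.CrossEntropyLoss()(model(x), y)
loss.backward()
optimizer.step()

# the invariant Example #29 logs via torch.sum(model.module[0].conv1.weight)
assert torch.equal(before, backbone[0].weight)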
Example #30
def train(args):

    rank = int(os.getenv("PADDLE_TRAINER_ID", 0))
    world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1))

    gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
    place = paddle.CUDAPlace(gpu_id)

    RELATED_FLAGS_SETTING = {}
    if args.seed == 0:
        RELATED_FLAGS_SETTING['FLAGS_cudnn_deterministic'] = 1
        RELATED_FLAGS_SETTING['FLAGS_benchmark'] = 1
        args.num_workers = 0
    else:
        # reached when args.seed is None or args.seed != 0
        RELATED_FLAGS_SETTING['FLAGS_cudnn_exhaustive_search'] = 1
        RELATED_FLAGS_SETTING['FLAGS_cudnn_batchnorm_spatial_persistent'] = 1
        RELATED_FLAGS_SETTING['FLAGS_max_inplace_grad_add'] = 8
    paddle.fluid.set_flags(RELATED_FLAGS_SETTING)

    if args.seed is not None:
        args.seed = args.seed + rank
        paddle.seed(args.seed)
        np.random.seed(args.seed)
        random.seed(args.seed)

    if world_size > 1:
        import paddle.distributed.fleet as fleet

        strategy = fleet.DistributedStrategy()
        strategy.without_graph_optimization = True
        fleet.init(is_collective=True, strategy=strategy)

    if args.use_synthetic_dataset:
        trainset = datasets.SyntheticDataset(args.num_classes, fp16=args.fp16)
    else:
        trainset = eval("datasets.{}".format(args.dataset_type))(
            root_dir=args.data_dir,
            label_file=args.label_file,
            rank=rank,
            world_size=world_size,
            fp16=args.fp16,
            is_bin=args.is_bin,
            seed=args.seed)

    num_image = trainset.total_num_samples
    total_batch_size = args.batch_size * world_size
    steps_per_epoch = num_image // total_batch_size
    if args.train_unit == 'epoch':
        warmup_steps = steps_per_epoch * args.warmup_num
        total_steps = steps_per_epoch * args.train_num
        decay_steps = [x * steps_per_epoch for x in args.decay_boundaries]
        total_epoch = args.train_num
    else:
        warmup_steps = args.warmup_num
        total_steps = args.train_num
        decay_steps = list(args.decay_boundaries)
        total_epoch = (total_steps + steps_per_epoch - 1) // steps_per_epoch

    logging.info('world_size: {}'.format(world_size))
    logging.info('total_batch_size: {}'.format(total_batch_size))
    logging.info('warmup_steps: {}'.format(warmup_steps))
    logging.info('steps_per_epoch: {}'.format(steps_per_epoch))
    logging.info('total_steps: {}'.format(total_steps))
    logging.info('total_epoch: {}'.format(total_epoch))
    logging.info('decay_steps: {}'.format(decay_steps))

    # linear scaling rule: scale the reference lr (tuned for batch size 512) by the total batch size
    base_lr = total_batch_size * args.lr / 512
    lr_scheduler = paddle.optimizer.lr.PiecewiseDecay(
        boundaries=decay_steps,
        values=[
            base_lr * (args.lr_decay**i) for i in range(len(decay_steps) + 1)
        ])
    if warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler, warmup_steps, 0, base_lr)

    if args.fp16:
        paddle.set_default_dtype("float16")

    margin_loss_params = eval("losses.{}".format(args.loss))()
    backbone = eval("backbones.{}".format(args.backbone))(
        num_features=args.embedding_size,
        dropout=args.dropout,
        data_format=args.data_format)
    classifier = eval("classifiers.{}".format(args.classifier))(
        rank=rank,
        world_size=world_size,
        num_classes=args.num_classes,
        margin1=margin_loss_params.margin1,
        margin2=margin_loss_params.margin2,
        margin3=margin_loss_params.margin3,
        scale=margin_loss_params.scale,
        sample_ratio=args.sample_ratio,
        embedding_size=args.embedding_size,
        fp16=args.fp16,
        numpy_init=args.lsc_init_from_numpy,
    )

    backbone.train()
    classifier.train()

    optimizer = HybridOptimizer(parameters=[{
        'params': backbone.parameters(),
    }, {
        'params': classifier.parameters(),
    }],
                                learning_rate=lr_scheduler,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.do_validation_while_train:
        callback_verification = CallBackVerification(
            args.validation_interval_step,
            rank,
            world_size,
            args.batch_size,
            args.val_targets,
            args.data_dir,
            fp16=args.fp16,
        )

    callback_logging = CallBackLogging(args.log_interval_step, rank,
                                       world_size, total_steps,
                                       args.batch_size)

    checkpoint = Checkpoint(
        rank=rank,
        world_size=world_size,
        embedding_size=args.embedding_size,
        num_classes=args.num_classes,
        model_save_dir=os.path.join(args.output, args.backbone),
        checkpoint_dir=args.checkpoint_dir,
        max_num_last_checkpoint=args.max_num_last_checkpoint)

    start_epoch = 0
    global_step = 0
    loss_avg = AverageMeter()
    if args.resume:
        extra_info = checkpoint.load(backbone,
                                     classifier,
                                     optimizer,
                                     for_train=True)
        start_epoch = extra_info['epoch'] + 1
        lr_state = extra_info['lr_state']
        # here last_epoch actually means last_step, since PiecewiseDecay is
        # always stepped per iteration rather than per epoch
        global_step = lr_state['last_epoch']
        lr_scheduler.set_state_dict(lr_state)

    batch_sampler = eval("paddle.io.{}".format(args.batch_sampler))(
        dataset=trainset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)

    train_loader = paddle.io.DataLoader(trainset,
                                        places=place,
                                        num_workers=args.num_workers,
                                        batch_sampler=batch_sampler)

    scaler = HybridParallelGradScaler(
        enable=args.fp16,
        init_loss_scaling=args.init_loss_scaling,
        incr_ratio=args.incr_ratio,
        decr_ratio=args.decr_ratio,
        incr_every_n_steps=args.incr_every_n_steps,
        decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
        use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
        grad_norm_clip=args.grad_norm_clip,
        grad_norm_clip_max=args.grad_norm_clip_max,
        world_size=world_size,
    )
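    # sync backbone parameters/buffers across ranks so training starts from identical weights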
    scaler.sync_params_buffers(backbone)

    for epoch in range(start_epoch, total_epoch):
        for step, (img, label) in enumerate(train_loader):
            global_step += 1

            with paddle.amp.auto_cast(enable=args.fp16):
                features = backbone(img)
                loss_v = classifier(features, label)

            # scale the loss to limit fp16 gradient underflow, then backprop
            scaler.scale(loss_v).backward()
            classifier.set_attr_for_sparse_momentum()
            scaler.sync_gradient_and_unscale(optimizer)
            scaler.step(optimizer)
            optimizer.clear_grad()

            lr_value = optimizer.get_lr()
            loss_avg.update(loss_v.item(), 1)
            callback_logging(global_step, loss_avg, epoch, lr_value)
            if args.do_validation_while_train:
                best_metric = callback_verification(global_step, backbone)
                if best_metric is not None and len(best_metric) > 0:
                    for ver_dataset in best_metric:
                        checkpoint.save(backbone,
                                        classifier,
                                        optimizer,
                                        epoch=epoch,
                                        for_train=True,
                                        best_metric=best_metric[ver_dataset])
            lr_scheduler.step()

            if global_step >= total_steps:
                break
            sys.stdout.flush()

        checkpoint.save(backbone,
                        classifier,
                        optimizer,
                        epoch=epoch,
                        for_train=True)
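
Example #30 builds its learning-rate schedule in two layers: a PiecewiseDecay
whose values are base_lr * lr_decay**i between successive decay boundaries,
wrapped in a LinearWarmup that ramps from 0 to base_lr over warmup_steps. A
pure-Python rendering of (approximately) the resulting schedule, handy for
checking boundaries before a run (the lr_at helper is mine, not paddle API):

def lr_at(step, base_lr, lr_decay, decay_steps, warmup_steps):
    # linear warmup from 0 to base_lr over the first warmup_steps
    if warmup_steps > 0 and step < warmup_steps:
        return base_lr * step / warmup_steps
    # afterwards: piecewise decay, multiplying by lr_decay at each boundary passed
    n = sum(1 for boundary in decay_steps if step >= boundary)
    return base_lr * (lr_decay ** n)

# e.g. decaying 10x at steps 10000 and 20000, with 1000 warmup steps
for step in (0, 500, 1000, 9999, 10000, 20000):
    print(step, lr_at(step, base_lr=0.1, lr_decay=0.1,
                      decay_steps=[10000, 20000], warmup_steps=1000))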