Example #1
    hc_idxs, uc_idxs = [], []
    total_uc_num = 0  # total number of uncertain (uc) samples consumed by ASM

    use_asm = True
    for epoch in range(1, args.epochs + 1):
        # adjust the classifier optimizer's lr to meta_lr
        adjust_lr(args.lr, optimizer_a, epoch, writer)

        # train on (imb_train_data)
        writer.add_scalar('ASM/batches', len(label_loader), global_step=epoch)

        train_base(label_loader, model, criterion, optimizer_a, epoch,
                   args.print_freq, writer)

        # evaluate on validation set
        prec1 = validate(test_loader, model, criterion, epoch, args.print_freq,
                         writer)

        # remember best prec@1 and save checkpoint
        if prec1 > best_prec1:
            best_prec1, best_epoch = prec1, epoch
            save_model(
                os.path.join(model_save_dir,
                             'rs32_epoch_{}.pth'.format(epoch)), model, epoch,
                best_prec1)
            # running ASM only after every precision gain would be a tidy rule, but it is too
            # idealistic: we want the actively selected samples to be what drives the precision gain

        # begin asm stage
        if use_asm and epoch >= args.init_epochs and len(
                unlabel_dataset) > args.uncertain_samples_size:
            # 1.detect on unlabel images
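The snippet above relies on an adjust_lr helper that is not shown. A minimal sketch of what it might look like, assuming a plain step decay plus TensorBoard logging (the milestones and decay factor are illustrative, not from the original code):

def adjust_lr(base_lr, optimizer, epoch, writer):
    # assumed step decay: divide the lr by 10 at epochs 80 and 90 (illustrative milestones)
    lr = base_lr * (0.1 ** sum(epoch >= m for m in (80, 90)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    writer.add_scalar('ASM/lr', lr, global_step=epoch)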
Example #2
train_acc_scores = []
val_acc_scores = []

for epoch in range(epochs):
    print(f"Epoch {epoch+1} of {epochs}")
    train_epoch_loss, f1_train, training_accuracy = train_with_scheduler(
        model, train_loader, optimizer, criterion, sgdr_partial, train_data,
        device)

    # train_epoch_loss, training_accuracy = train(
    #     model, train_loader, optimizer, criterion, train_data, device
    # )

    valid_epoch_loss, f1_val, val_accuracy = validate(model, valid_loader,
                                                      criterion, valid_data,
                                                      device)
    train_loss.append(train_epoch_loss)
    valid_loss.append(valid_epoch_loss)

    train_f1_scores.append(f1_train)
    val_f1_scores.append(f1_val)

    train_acc_scores.append(training_accuracy)
    val_acc_scores.append(val_accuracy)

    print(f"Train Loss: {train_epoch_loss:.4f}")
    print(f"Training Acc : {training_accurracy:.4f}")
    print(f"Training f1 : {f1_train:.4f}")

    print(f'Val Loss: {valid_epoch_loss:.4f}')
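The validate call above returns a loss, an F1 score, and an accuracy, but its body is not shown. A minimal sketch with that signature, assuming a standard classification setup, scikit-learn's f1_score, and a loader that yields (images, labels) batches:

import torch
from sklearn.metrics import f1_score

def validate(model, dataloader, criterion, dataset, device):
    # single pass over the validation set; returns (average loss, macro F1, accuracy)
    model.eval()
    running_loss, correct = 0.0, 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            running_loss += criterion(outputs, labels).item() * labels.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    avg_loss = running_loss / len(dataset)
    accuracy = correct / len(dataset)
    return avg_loss, f1_score(all_labels, all_preds, average='macro'), accuracy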
Example #3
)
test_meters = [
    TensorboardLogger(
        log_dir=test_tb_log_dir,
        delimiter="  ",
    ) for test_tb_log_dir in test_tb_log_dirs
]

if cfg.EVALUATE:
    for task_name, testloader, test_meter in zip(task_names, testloaders,
                                                 test_meters):
        logging.info("Evaluating dataset: {}".format(task_name))
        validate(testloader,
                 net,
                 criterion_eval,
                 cfg,
                 test_meter,
                 global_step=0,
                 device=device,
                 local_rank=get_rank())

############## training code #############################
if not cfg.EVALUATE:
    scaler = torch.cuda.amp.GradScaler(enabled=cfg.AMP.ENABLED)
    # start from epoch 0 or last checkpoint epoch
    start_epoch = checkpointer.epoch
    for epoch in range(start_epoch, cfg.OPTIM.EPOCHS):
        # wait for all processes before every epoch
        synchronize()
        logging.info("PROGRESS: {}%".format(
            round(100 * epoch / cfg.OPTIM.EPOCHS, 4)))
        global_step = epoch * len(trainloader)
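The loop is cut off before the per-batch work. A typical mixed-precision step with the GradScaler created above might look like the sketch below; the loader unpacking and the optimizer/criterion names are assumptions, not taken from the original training code:

for inputs, targets in trainloader:
    inputs, targets = inputs.to(device), targets.to(device)
    optimizer.zero_grad()
    # run the forward pass in autocast so eligible ops use fp16
    with torch.cuda.amp.autocast(enabled=cfg.AMP.ENABLED):
        outputs = net(inputs)
        loss = criterion(outputs, targets)
    # scale the loss to avoid fp16 gradient underflow, then step through the scaler
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    global_step += 1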
Example #4
model = config_file.model
model = model.to(device)
# Optimizer
optimizer = torch.optim.Adam(params=model.parameters(),
                             lr=config.LEARNING_RATE)

# Log metrics with wandb
wandb.watch(model, log="all")
# Training loop
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(config.TRAIN_EPOCHS):
    engine.train(epoch, tokenizer, model, device, training_loader, optimizer)

# Validation loop and saving the resulting file with predictions and actuals in a dataframe.
# Saving the dataframe as predictions.csv
print(
    'Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe'
)
for epoch in range(config.VAL_EPOCHS):
    predictions, actuals = engine.validate(epoch, tokenizer, model, device,
                                           val_loader)
    final_df = pd.DataFrame({
        'Generated Text': predictions,
        'Actual Text': actuals
    })
    final_df.to_csv(
        '/home/hasan/Desktop/Code to keep on Github/Text Summarization/predictions.csv'
    )
    print('Output Files generated for review')
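engine.validate is not shown here. A minimal sketch of a generate-and-decode validation pass for a seq2seq summarization model (the batch field names, max_length and num_beams are assumptions):

import torch

def validate(epoch, tokenizer, model, device, loader):
    # generate summaries over the validation loader; returns decoded predictions and references
    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for batch in loader:
            ids = batch['source_ids'].to(device)
            mask = batch['source_mask'].to(device)
            generated = model.generate(input_ids=ids, attention_mask=mask,
                                       max_length=150, num_beams=2)
            predictions.extend(tokenizer.decode(g, skip_special_tokens=True)
                               for g in generated)
            actuals.extend(tokenizer.decode(t, skip_special_tokens=True)
                           for t in batch['target_ids'])
    return predictions, actuals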
Example #5
# initialize the transform
transform = transform()
# prepare the training and validation data loaders
train_data, valid_data = prepare_dataset(root_path='../input/catsNdogs/')
trainset = LFWDataset(train_data, transform=transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
validset = LFWDataset(valid_data, transform=transform)
validloader = DataLoader(validset, batch_size=batch_size)

train_loss = []
valid_loss = []
grid_images = []  # image grids collected each epoch for the final .gif
for epoch in range(epochs):
    print(f"Epoch {epoch+1} of {epochs}")
    train_epoch_loss = train(model, trainloader, trainset, device, optimizer,
                             criterion)
    valid_epoch_loss, recon_images = validate(model, validloader, validset,
                                              device, criterion)
    train_loss.append(train_epoch_loss)
    valid_loss.append(valid_epoch_loss)
    # save the reconstructed images from the validation loop
    save_reconstructed_images(recon_images, epoch + 1)
    # convert the reconstructed images to PyTorch image grid format
    image_grid = make_grid(recon_images.detach().cpu())
    grid_images.append(image_grid)
    print(f"Train Loss: {train_epoch_loss:.4f}")
    print(f"Val Loss: {valid_epoch_loss:.4f}")

# save the reconstructions as a .gif file
image_to_vid(grid_images)
# save the loss plots to disk
save_loss_plot(train_loss, valid_loss)
print('TRAINING COMPLETE')
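save_reconstructed_images and image_to_vid are not defined in the snippet. One plausible implementation, assuming torchvision for writing the image grid and imageio for the .gif (the output paths are illustrative):

import imageio
import numpy as np
from torchvision.utils import save_image

def save_reconstructed_images(recon_images, epoch):
    # write this epoch's reconstructions as a single image grid
    save_image(recon_images.cpu(), f"../outputs/output{epoch}.jpg")

def image_to_vid(images):
    # images: list of CxHxW grids assumed to be in [0, 1]; convert to HxWxC uint8 frames
    frames = [np.uint8(255 * img.permute(1, 2, 0).numpy()) for img in images]
    imageio.mimsave('../outputs/generated_images.gif', frames)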
Example #6
def main():
    # Hyper Parameters setting
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/mnt/data/linkaiyi/scan/data/f30k_precomp',
                        help='path to datasets')
    parser.add_argument('--path_opt',
                        default='option/FusionNoattn_baseline.yaml',
                        type=str,
                        help='path to a yaml options file')
    parser.add_argument('--data_name',
                        default='flickr30k_splits',
                        help='{coco,f30k}_splits')
    parser.add_argument('--logger_name',
                        default='./log_2',
                        help='Path to save Tensorboard log.')
    parser.add_argument(
        '--vocab_path',
        default=
        '/home/linkaiyi/fusion_wangtan/Fusion_flickr/Fusion_10.28/vocab',
        help='Path to saved vocabulary json files.')
    parser.add_argument(
        '--model_name',
        default='/mnt/data/linkaiyi/mscoco/fusion/Fusion_flic/runs/checkpoint',
        help='Path to save the model.')
    parser.add_argument('--num_epochs',
                        default=120,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--workers',
                        default=2,
                        type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--lr_update',
                        default=20,
                        type=int,
                        help='Number of epochs to update the learning rate.')

    opt = parser.parse_args()
    if os.path.isdir(opt.logger_name):
        if click.confirm('Logs directory already exists in {}. Erase?'.format(
                opt.logger_name), default=False):
            os.system('rm -r ' + opt.logger_name)
    tb_logger.configure(opt.logger_name, flush_secs=5)
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    #########################################################################################
    # Create options
    #########################################################################################

    options = {'logs': {}, 'coco': {}, 'model': {'seq2vec': {}}, 'optim': {}}
    if opt.path_opt is not None:
        with open(opt.path_opt, 'r') as handle:
            options_yaml = yaml.load(handle, Loader=yaml.FullLoader)
        options = utils.update_values(options, options_yaml)

    vocab = deserialize_vocab(
        os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    vocab_word = sorted(vocab.word2idx.items(),
                        key=lambda x: x[1],
                        reverse=False)
    vocab_word = [tup[0] for tup in vocab_word]
    opt.vocab_size = len(vocab)

    # Create dataset, model, criterion and optimizer

    train_loader, val_loader = data.get_loaders(opt.data_path, vocab,
                                                opt.batch_size, opt.workers,
                                                opt)
    model = models.factory(options['model'],
                           vocab_word,
                           cuda=True,
                           data_parallel=False)

    criterion = nn.CrossEntropyLoss(weight=torch.Tensor([1, 128])).cuda()
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        model.parameters()),
                                 lr=options['optim']['lr'])

    print('Model has {} parameters'.format(utils.params_count(model)))

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            engine.validate(val_loader, model, criterion, optimizer,
                            opt.batch_size)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))
    else:
        start_epoch = 0

    # Train the Model
    best_rsum = 0
    for epoch in range(start_epoch, opt.num_epochs):

        adjust_learning_rate(opt, options, optimizer, epoch)

        # train for one epoch

        engine.train(train_loader,
                     model,
                     criterion,
                     optimizer,
                     epoch,
                     print_freq=10)

        # evaluate on validation set
        rsum = engine.validate(val_loader, model, criterion, optimizer,
                               opt.batch_size)

        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        if not os.path.exists(opt.model_name):
            os.mkdir(opt.model_name)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': 'baseline',
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'options': options,
                'Eiters': model.Eiters,
            },
            is_best,
            filename='checkpoint_{}{}.pth.tar'.format(epoch, best_rsum),
            prefix=opt.model_name + '/')
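adjust_learning_rate is not shown. Going by the --lr_update help text ("Number of epochs to update the learning rate"), a common sketch is a 10x decay every lr_update epochs (the decay factor is an assumption):

def adjust_learning_rate(opt, options, optimizer, epoch):
    # decay the base lr by 10x every opt.lr_update epochs (factor assumed)
    lr = options['optim']['lr'] * (0.1 ** (epoch // opt.lr_update))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr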
Example #7
def valid_write(path, image):
    cv2.imwrite(path, engine.validate(image))
Example #8
# train data loader
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# validation data loader
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)

# starting the training loop and validation
train_loss = []
valid_loss = []

for epoch in range(epochs):
    print(f"Epoch ", epoch + 1, " of {epochs}")

    train_epoch_loss = train(model, train_loader, optimizer, criterion,
                             train_data, device)
    valid_epoch_loss = validate(model, valid_loader, optimizer, criterion,
                                valid_data, device)

    train_loss.append(train_epoch_loss)
    valid_loss.append(valid_epoch_loss)

    print(f"Train Loss: {train_epoch_loss:.4f}")
    print(f"Val Loss: {valid_epoch_loss:.4f}")

    torch.save(
        {
            'epoch': epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': criterion,
        }, '../outputs/model.pth')
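The checkpoint written above can be restored later. A minimal sketch of loading it back, assuming the model and optimizer have already been rebuilt with the same architecture and settings:

import torch

checkpoint = torch.load('../outputs/model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch']
criterion = checkpoint['loss']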
Example #9
def main_worker(save_dir, args):
    # basic setup
    cudnn.benchmark = True

    if args.log_name is not None:
        log_dir = "runs/%s" % args.log_name
    else:
        log_dir = f"runs/{datetime.datetime.now().strftime('%m-%d-%H-%M-%S')}"

    if args.local_rank == 0:
        logger = SummaryWriter(log_dir)
    else:
        logger = None

    deepspeed.init_distributed(dist_backend='nccl')
    torch.cuda.set_device(args.local_rank)

    model = SetVAE(args)
    parameters = model.parameters()

    n_parameters = sum(p.numel() for p in parameters if p.requires_grad)
    print(f'number of params: {n_parameters}')
    try:
        n_gen_parameters = sum(p.numel() for p in model.init_set.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.pre_decoder.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.decoder.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.post_decoder.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.output.parameters() if p.requires_grad)
        print(f'number of generator params: {n_gen_parameters}')
    except AttributeError:
        pass

    optimizer, criterion = model.make_optimizer(args)

    # initialize datasets and loaders
    train_dataset, val_dataset, train_loader, val_loader = get_datasets(args)

    # initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(
            optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=args.epochs // 2,
                                                    gamma=0.1)
    elif args.scheduler == 'linear':

        def lambda_rule(ep):
            lr_w = min(1., ep /
                       args.warmup_epochs) if (args.warmup_epochs > 0) else 1.
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(
                0.5 * args.epochs)
            return lr_l * lr_w

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lambda_rule)
    elif args.scheduler == 'cosine':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.epochs)
    else:
        # Fake SCHEDULER
        def lambda_rule(ep):
            return 1.0

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lambda_rule)

    # extract collate_fn
    if args.distributed:
        collate_fn = deepcopy(train_loader.collate_fn)
        model, optimizer, train_loader, scheduler = deepspeed.initialize(
            args=args,
            model=model,
            optimizer=optimizer,
            model_parameters=parameters,
            training_data=train_dataset,
            collate_fn=collate_fn,
            lr_scheduler=scheduler)

    # resume checkpoints
    start_epoch = 0
    if args.resume_checkpoint is None and Path(
            Path(save_dir) / 'checkpoint-latest.pt').exists():
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
        print('Resumed from: ' + args.resume_checkpoint)
    if args.resume_checkpoint is not None:
        if args.distributed:
            if args.resume_optimizer:
                model.module, model.optimizer, model.lr_scheduler, start_epoch = resume(
                    args.resume_checkpoint,
                    model.module,
                    model.optimizer,
                    scheduler=model.lr_scheduler,
                    strict=(not args.resume_non_strict))
            else:
                model.module, _, _, start_epoch = resume(
                    args.resume_checkpoint,
                    model.module,
                    optimizer=None,
                    strict=(not args.resume_non_strict))
        else:
            if args.resume_optimizer:
                model, optimizer, scheduler, start_epoch = resume(
                    args.resume_checkpoint,
                    model,
                    optimizer,
                    scheduler=scheduler,
                    strict=(not args.resume_non_strict))
            else:
                model, _, _, start_epoch = resume(
                    args.resume_checkpoint,
                    model,
                    optimizer=None,
                    strict=(not args.resume_non_strict))

    # save dataset statistics
    if args.local_rank == 0:
        train_dataset.save_statistics(save_dir)
        val_dataset.save_statistics(save_dir)

    # main training loop
    avg_meters = {
        'kl_avg_meter': AverageValueMeter(),
        'l2_avg_meter': AverageValueMeter()
    }

    assert args.distributed

    epoch = start_epoch
    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs):
        if args.local_rank == 0:
            # evaluate on the validation set
            if epoch % args.val_freq == 0 and epoch != 0:
                model.eval()
                with torch.no_grad():
                    val_res = validate(model.module, args, val_loader, epoch,
                                       logger, save_dir)
                    for k, v in val_res.items():
                        v = v.cpu().detach().item()
                        send_slack(f'{k}:{v}, Epoch {epoch - 1}')
                        if logger is not None and v is not None:
                            logger.add_scalar(f'val_sample/{k}', v, epoch - 1)

        # train for one epoch
        train_one_epoch(epoch, model, criterion, optimizer, args, train_loader,
                        avg_meters, logger)

        # Only on HEAD process
        if args.local_rank == 0:
            # save checkpoints
            if (epoch + 1) % args.save_freq == 0:
                if args.eval:
                    validate_reconstruct_l2(epoch, val_loader, model,
                                            criterion, args, logger)
                save(model.module, model.optimizer, model.lr_scheduler,
                     epoch + 1,
                     Path(save_dir) / f'checkpoint-{epoch}.pt')
                save(model.module, model.optimizer, model.lr_scheduler,
                     epoch + 1,
                     Path(save_dir) / 'checkpoint-latest.pt')

            # save visualizations
            if (epoch + 1) % args.viz_freq == 0:
                with torch.no_grad():
                    visualize(model.module, args, val_loader, epoch, logger)

        # adjust the learning rate
        model.lr_scheduler.step()
        if logger is not None and args.local_rank == 0:
            logger.add_scalar('train lr',
                              model.lr_scheduler.get_last_lr()[0], epoch)

    model.eval()
    if args.local_rank == 0:
        with torch.no_grad():
            val_res = validate(model.module, args, val_loader, epoch, logger,
                               save_dir)
            for k, v in val_res.items():
                v = v.cpu().detach().item()
                send_slack(f'{k}:{v}, Epoch {epoch}')
                if logger is not None and v is not None:
                    logger.add_scalar(f'val_sample/{k}', v, epoch)

    if logger is not None:
        logger.flush()
        logger.close()
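avg_meters uses an AverageValueMeter that is not defined in the snippet. If the usual torchnet class is unavailable, a simplified stand-in with an add/value/reset interface could look like this sketch:

class AverageValueMeter:
    # running mean of scalar values (simplified stand-in, not the original class)
    def __init__(self):
        self.reset()

    def reset(self):
        self.sum, self.n = 0.0, 0

    def add(self, value, n=1):
        self.sum += value * n
        self.n += n

    def value(self):
        return self.sum / self.n if self.n > 0 else float('nan')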
Example #10
                                        num_workers=8)

# Initialize the network, loss and optimizer
if args.modality == 'PAN':
    model = modelDef.panNet()
elif args.modality == 'MS':
    model = modelDef.msNet()
elif args.modality == 'HS1':
    model = modelDef.HS1(dataset.data.shape[1])
elif args.modality == 'HS2':
    model = modelDef.HS2(dataset.data.shape[1])

model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args.learningRate)

# Training and Inference
print('Training (' + args.modality + ') Network...')
bestAcc = 0
for epoch in tqdm(range(args.nEpochs)):
    engine.train(model, trainLoader, optimizer, criterion, device, plotter,
                 epoch)
    accuracy, classWiseAcc = engine.validate(model, valLoader, criterion,
                                             device, plotter, epoch)
    if accuracy > bestAcc:
        utils.save_model({'stateDict': model.state_dict()}, args.name,
                         args.modality + 'net_instance_' + str(args.instance) +
                         '.pth.tar')
        bestAcc = accuracy
print('Best Accuracy: ' + str(bestAcc))
# print('Classwise Accuracy: '+str(classWiseAcc))
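engine.validate here returns an overall accuracy and a class-wise accuracy; its body is not shown. A minimal sketch of such a pass, assuming the loader yields (inputs, labels) batches and a hypothetical num_classes argument (plotter logging omitted):

import torch

def validate(model, loader, criterion, device, plotter, epoch, num_classes=10):
    # per-class correct/total counts over the validation loader
    model.eval()
    correct = torch.zeros(num_classes)
    total = torch.zeros(num_classes)
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            preds = model(inputs).argmax(dim=1)
            for c in range(num_classes):
                mask = labels == c
                total[c] += mask.sum().item()
                correct[c] += (preds[mask] == c).sum().item()
    class_wise_acc = (correct / total.clamp(min=1)).tolist()
    accuracy = correct.sum().item() / max(total.sum().item(), 1)
    return accuracy, class_wise_acc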