Esempio n. 1
0
    if args.use_weights:
        softmax_proteins, *_ = next(iter(train_loader))
        softmax_proteins = softmax_proteins[:4]
    else:
        softmax_proteins = next(iter(train_loader))[:4]
    softmax_name = args.results_dir / Path("softmax.png")

    if args.anneal_learning_rates and args.plot_learning_rates:
        learning_rates = []
        learning_rates_name = args.results_dir / Path("learning_rates.png")

    try:
        for epoch in range(1, args.epochs + 1):
            start_time = time.time()
            train_loss, train_metrics = train_epoch(
                epoch, model, optimizer, train_loader, args.log_interval,
                args.clip_grad_norm, args.clip_grad_value, scheduler)

            if args.val_ratio > 0:
                val_loss, val_metrics = validate(epoch, model, val_loader)
                loss_str = "Validation"
                loss_value_str = f"{val_loss:.5f}"
                val_str = f"{loss_str} loss: {loss_value_str} "
                improved = val_loss < best_loss

            else:
                loss_str = "Training"
                loss_value_str = f"{train_loss:.5f}"
                val_str = ""
                improved = train_loss < best_loss
Esempio n. 2
0
def main_worker(index, opt):
    """Run one training worker: build the (optionally distributed) model
    with the optional STRG head + RPN, set up loaders/loggers/writers,
    then train, validate and optionally run inference.

    Args:
        index: GPU/worker index (>= 0 when spawned one process per GPU;
            negative for a single-process run).
        opt: parsed options namespace; mutated in place (``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``,
            ``is_master_node``).
    """
    # Seed every RNG source so each worker is reproducible.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        # Bind this worker to its own GPU (f-string form, consistent with
        # the sibling main_worker implementations).
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs-per-node + local GPU index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the global batch and loader threads across the GPUs
        # (thread count rounded up).
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes, opt.strg)

    if opt.strg:
        # Wrap the backbone with the STRG head and build its
        # region-proposal network; both are parallelised the same way.
        model = STRG(model, nclass=opt.n_classes, nrois=opt.nrois)
        rpn = RPN(nrois=opt.nrois)
        rpn = make_data_parallel(rpn, opt.distributed, opt.device)
    else:
        rpn = None

    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)

    model = make_data_parallel(model, opt.distributed, opt.device)

    # Fine-tuning parameter selection is disabled in this variant:
    # all model parameters are optimized.
    parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        # tensorboardX is used here (not torch.utils.tensorboard) —
        # presumably so wandb's sync_tensorboard hook below can pick up
        # the events; verify before swapping implementations.
        from tensorboardX import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Resuming: drop events logged at/after the resume epoch.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    if opt.wandb:
        name = str(opt.result_path)
        wandb.init(
            project='strg',
            name=name,
            config=opt,
            dir=name,
            sync_tensorboard=True)

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Re-shuffle the distributed sampler per epoch.
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i,
                        train_loader,
                        model,
                        criterion,
                        optimizer,
                        opt.device,
                        current_lr,
                        train_logger,
                        train_batch_logger,
                        tb_writer,
                        opt.distributed,
                        rpn=rpn,
                        det_interval=opt.det_interval,
                        nrois=opt.nrois)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i,
                                      val_loader,
                                      model,
                                      criterion,
                                      opt.device,
                                      val_logger,
                                      tb_writer,
                                      opt.distributed,
                                      rpn=rpn,
                                      det_interval=opt.det_interval,
                                      nrois=opt.nrois)

        # Step the LR schedule once per epoch; 'plateau' needs the
        # validation loss (None when validation is disabled).
        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
Esempio n. 3
0
def main_worker(index, opt):
    """Run a single (possibly distributed) training worker.

    Seeds all RNGs, builds and optionally fine-tunes/resumes the model,
    then runs the epoch loop (train, validate, LR schedule) and an
    optional final inference pass.

    Args:
        index: GPU/worker index (>= 0 when spawned one process per GPU).
        opt: options namespace; mutated in place (``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``,
            ``is_master_node``, ``begin_epoch``).
    """
    # Deterministic seeding across python, numpy and torch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        # Each spawned worker owns one GPU.
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs-per-node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Divide the global batch and loader threads among the GPUs
        # (thread count rounded upward).
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)

    model = make_data_parallel(model, opt.distributed, opt.device)

    # When fine-tuning, optimize only the layers selected by
    # ft_begin_module; otherwise everything.
    parameters = (get_fine_tuning_parameters(model, opt.ft_begin_module)
                  if opt.pretrain_path else model.parameters())

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.resume_path is not None:
        if opt.no_train:
            # Evaluation-only resume: optimizer/scheduler are discarded.
            opt.begin_epoch, model, _, _ = resume(opt.resume_path, opt.arch,
                                                  opt.begin_epoch, model)
        else:
            opt.begin_epoch, model, optimizer, scheduler = resume(
                opt.resume_path, opt.arch, opt.begin_epoch, model, optimizer,
                scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones

    tb_writer = None
    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        writer_kwargs = {'log_dir': opt.result_path}
        if opt.begin_epoch != 1:
            # Resuming: purge events logged at/after the resume epoch.
            writer_kwargs['purge_step'] = opt.begin_epoch
        tb_writer = SummaryWriter(**writer_kwargs)

    prev_val_loss = None
    for epoch in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Re-shuffle the distributed sampler for this epoch.
                train_sampler.set_epoch(epoch)
            current_lr = get_lr(optimizer)
            train_epoch(epoch, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if epoch % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(epoch)
                save_checkpoint(save_file_path, epoch, opt.arch, model,
                                optimizer, scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(epoch, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        # Advance the LR schedule once per epoch; 'plateau' consumes the
        # latest validation loss (None when validation is disabled).
        if not opt.no_train:
            if opt.lr_scheduler == 'multistep':
                scheduler.step()
            elif opt.lr_scheduler == 'plateau':
                scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
Esempio n. 4
0
def main_worker(index, opt):
    """Debug/inspection variant of the training worker.

    Builds the model, prints the classifier head's shapes after each
    setup stage, pushes a single batch through the model to render its
    computation graph to /apollo/data/model.png, then returns early.

    NOTE(review): everything after the bare ``return`` below is
    unreachable dead code — apparently the original training loop kept
    for reference.
    """
    # Seed every RNG source for reproducibility.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        # Bind this worker to its own GPU.
        opt.device = torch.device(f'cuda:{index}')

    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    # Inspect the final FC layer after each model-mutation step —
    # assumes the model exposes a `.fc` head (TODO confirm for all archs).
    print('after generating model:', model.fc.in_features, ':',
          model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)

    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    print('after resume model:', model.fc.in_features, ':',
          model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)
    # summary(model, input_size=(3, 112, 112))
    #    if opt.pretrain_path:
    #        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
    #                                      opt.n_finetune_classes)

    print('after pretrained  model:', model.fc.in_features, ':',
          model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)
    print(torch_summarize(model))
    # parameters = model.parameters()
    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name, param.data)
    #    summary(model, (3, 112, 112))
    #    return

    #    print('model parameters shape', parameters.shape)

    (train_loader, train_sampler, train_logger, train_batch_logger, optimizer,
     scheduler) = get_train_utils(opt, model.parameters())

    # Render the computation graph for exactly one batch, then stop.
    for i, (inputs, targets) in enumerate(train_loader):
        print('input shape:', inputs.shape)
        print('targets shape:', targets.shape)
        outputs = model(inputs)
        print("output shape", outputs.shape)
        model_arch = make_dot(outputs, params=dict(model.named_parameters()))
        print(model_arch)
        model_arch.render("/apollo/data/model.png", format="png")
        # Source(model_arch).render('/apollo/data/model.png')
        # print("generating /apollo/data/model.png")
        break

    # make_dot(yhat, params=dict(list(model.named_parameters()))).render("rnn_torchviz", format="png")

    # Early exit for debugging; nothing below this line ever executes.
    return

    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Resuming: purge events logged at/after the resume epoch.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
Esempio n. 5
0
def main():
    """Fine-tune a BERT classifier on the 'thedeep' dataset.

    Reads the CSV, splits 70/30 into train/validation, trains for
    ``Epochs`` epochs, and saves the weights with the best validation
    accuracy to 'best_model.bin'.
    """
    # Read data.
    # NOTE(review): header=1 treats the file's *second* line as the header
    # before applying `names` — confirm the data file really has an extra
    # leading row (header=0 is the usual pairing with names=).
    df = pd.read_csv("../data/thedeep.data.txt",
                     sep=",",
                     header=1,
                     names=['sentence_id', 'text', 'label'])

    # Load the BERT tokenizer.
    print('Loading BERT Tokenizer....')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Split the dataframe into training and validation subsets (70/30).
    training_inputs, validation_inputs = train_test_split(df,
                                                          random_state=2018,
                                                          test_size=0.3)
    batch_size = 32
    max_length = 80
    Epochs = 1

    train_dataset = dataset.BertClassificationDataset(
        text=training_inputs.text.values,
        label=training_inputs.label.values,
        tokenizer=tokenizer,
        max_length=max_length)
    valid_dataset = dataset.BertClassificationDataset(
        text=validation_inputs.text.values,
        label=validation_inputs.label.values,
        tokenizer=tokenizer,
        max_length=max_length)

    # Create a dataloader for training and validation data.
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=RandomSampler(valid_dataset),
                                  batch_size=batch_size)

    # Create an instance of the BERT model, optimizer and scheduler.
    device = torch.device("cuda")
    print('Loading BERT Model....')
    model = BertTextClassification('bert-base-uncased')
    model = model.to(device)

    param_optimizer = list(model.named_parameters())

    # Standard BERT fine-tuning practice: no weight decay on biases and
    # LayerNorm parameters.
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.1
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]

    # The scheduler counts optimizer steps (one per batch), so the
    # horizon is batches-per-epoch * epochs.  BUG FIX: the original used
    # len(train_dataset), overstating the horizon by a factor of
    # batch_size and leaving the linear decay essentially flat over the
    # real training run.
    num_train_steps = len(train_dataloader) * Epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    loss_fn = nn.CrossEntropyLoss().to(device)

    # Training and evaluating the model on the validation dataset.
    training_stats = []
    best_accuracy = 0

    for epoch in range(Epochs):

        print(f'Epoch {epoch + 1}/{Epochs}')
        print('-' * 10)

        train_accuracy, train_loss = training.train_epoch(
            model, train_dataloader, loss_fn, optimizer, device, scheduler,
            training_inputs)

        print(f'Train loss {train_loss} accuracy {train_accuracy}')

        val_accuracy, val_loss = training.eval_model(model, valid_dataloader,
                                                     loss_fn, device,
                                                     validation_inputs)

        print(f'Val   loss {val_loss} accuracy {val_accuracy}')
        print()

        # Record all statistics from this epoch.
        training_stats.append({
            'epoch': epoch + 1,
            'Training Loss': train_loss,
            'Training Accuracy': train_accuracy,
            'Valid. Loss': val_loss,
            'Valid. Accur.': val_accuracy,
        })

        # Keep only the best-performing checkpoint.
        if val_accuracy > best_accuracy:
            torch.save(model.state_dict(), 'best_model.bin')
            best_accuracy = val_accuracy
Esempio n. 6
0
def main_worker(index, opt):
    """Training worker variant with optional label smoothing, a dropout
    FC-head replacement, and a Leslie-Smith learning-rate range test.

    Args:
        index: GPU/worker index (>= 0 when spawned one process per GPU).
        opt: options namespace; mutated in place (``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``,
            ``is_master_node``, ``begin_epoch``).
    """
    # Seed every RNG source so each worker is reproducible.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        # Bind this worker to its own GPU.
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs-per-node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Divide the global batch and loader threads among the GPUs
        # (thread count rounded up).
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.dropout:
        # Swap the classifier head for one with dropout; its output size
        # depends on whether we're fine-tuning a pretrained model.
        n_classes = opt.n_classes
        if opt.pretrain_path is not None:
            n_classes = opt.n_finetune_classes
        model = replace_fc_layer(model=model,
                                 dropout_factor=opt.dropout_factor,
                                 n_classes=n_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)

    model = make_data_parallel(model, opt.distributed, opt.device)
    # When fine-tuning, optimize only the layers from ft_begin_module on.
    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    if opt.labelsmoothing:
        criterion = LabelSmoothingCrossEntropy().to(opt.device)
    else:
        criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Resuming: purge events logged at/after the resume epoch.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    if opt.lr_finder and not opt.no_train and not opt.no_val:
        # LR range test: sweep the learning rate, dump the history to
        # lr_search.json, then exit without training.
        print(
            "Performing Learning Rate Search\nWith Leslie Smith's approach...")
        lr_finder = LRFinder(model, optimizer, criterion, device=opt.device)
        lr_finder.range_test(train_loader,
                             val_loader=val_loader,
                             start_lr=opt.learning_rate,
                             end_lr=opt.lrf_end_lr,
                             num_iter=opt.lrf_num_it,
                             step_mode=opt.lrf_mode)
        lr_finder.plot(log_lr=False)
        with (opt.result_path / 'lr_search.json').open('w') as results_file:
            json.dump(lr_finder.history, results_file, default=json_serial)
        # Restore the model/optimizer state mutated by the range test.
        lr_finder.reset()
        return

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Re-shuffle the distributed sampler for this epoch.
                train_sampler.set_epoch(i)
            #current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, train_logger, train_batch_logger,
                        scheduler, opt.lr_scheduler, tb_writer,
                        opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        # Per-epoch LR schedule step; 'plateau' consumes the latest
        # validation loss (None when validation is disabled).
        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)
        elif not opt.no_train and opt.lr_scheduler == 'cosineannealing':
            scheduler.step()

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
Esempio n. 7
0
def main_worker(index, opt):
    """Training worker (variant without resume/pretrain/inference support).

    Fixes relative to the original:
    - the tensorboard writer is now created when ``opt.tensorboard`` is
      *enabled* (the condition was inverted: ``not opt.tensorboard``),
      matching the sibling ``main_worker`` implementations;
    - ``prev_val_loss`` is initialised before the epoch loop so the
      'plateau' scheduler branch cannot raise UnboundLocalError when
      validation is disabled.

    Args:
        index: GPU/worker index (>= 0 when spawned one process per GPU).
        opt: options namespace; mutated in place (``device``,
            ``dist_rank``, ``batch_size``, ``is_master_node``).
    """
    # Seed every RNG source so each worker is reproducible.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        # Bind this worker to its own GPU.
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs-per-node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        # opt.n_threads = int(
        #     (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    # NOTE(review): 'genarate_model' looks like a typo of 'generate_model',
    # but the helper may genuinely be defined under this name elsewhere —
    # confirm before renaming.
    model = genarate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    if opt.distributed:
        model = make_data_parallel(model, opt.device)
    else:
        model.to(opt.device)
        # model = nn.DataParallel(model).cuda()

    # Parameter count in millions.
    print('Total params: %.2fM' % (sum(p.numel()
                                       for p in model.parameters()) / 1000000.0))
    if opt.is_master_node:
        print(model)
    parameters = model.parameters()
    criterion = CrossEntropyLoss().to(opt.device)

    (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)

    val_loader, val_logger = get_val_utils(opt)

    # BUG FIX: was `if not opt.tensorboard and ...`, which created the
    # writer only when tensorboard was disabled.
    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Resuming: purge events logged at/after the resume epoch.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    print('数据加载完毕')  # "data loading finished"
    # Initialised up-front so the 'plateau' branch below is safe even when
    # opt.no_val skips validation (matches the sibling implementations).
    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Re-shuffle the distributed sampler for this epoch.
                train_sampler.set_epoch(i)
                # train_sampler2.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, opt.is_master_node, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, opt.is_master_node, tb_writer,
                                      opt.distributed)

        # Per-epoch LR schedule step; 'plateau' consumes the latest
        # validation loss (None when validation is disabled).
        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)
Esempio n. 8
0
def main(opt):
    """Train, validate and optionally run inference using PaddlePaddle's
    (fluid) dygraph mode.

    Saves a checkpoint every ``opt.checkpoint`` epochs and an extra
    checkpoint whenever validation accuracy beats the running best
    (initialised to 0.88).
    """
    place = fluid.CPUPlace() if opt.no_cuda else fluid.CUDAPlace(0)
    with fluid.dygraph.guard(place):
        print(place)
        # Seed python/numpy/paddle RNGs for reproducibility.
        random.seed(opt.manual_seed)
        np.random.seed(opt.manual_seed)
        prog = fluid.default_main_program()
        prog.global_seed(opt.manual_seed)
        os.environ['PYTHONHASHSEED'] = str(opt.manual_seed)

        model = generate_model(opt)
        if opt.pretrain_path:
            model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                          opt.n_finetune_classes)

        if opt.resume_path is not None:
            model = resume_model(opt.resume_path, model)

        # When fine-tuning, optimize only the layers from ft_begin_module.
        if opt.pretrain_path:
            parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
        else:
            parameters = model.parameters()

        if not opt.no_train:
            (train_loader, train_logger, train_batch_logger, optimizer,
             scheduler) = get_train_utils(opt, parameters)
            if opt.resume_path is not None:
                opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                    opt.resume_path, optimizer, scheduler)
                if opt.overwrite_milestones:
                    scheduler.milestones = opt.multistep_milestones

        if not opt.no_val:
            val_loader, val_logger = get_val_utils(opt)

        best_acc = 0.88
        for epoch in range(opt.begin_epoch, opt.n_epochs + 1):
            if not opt.no_train:
                train_epoch(epoch, train_loader, model, optimizer, scheduler,
                            train_logger, train_batch_logger)

                if epoch % opt.checkpoint == 0:
                    # NOTE(review): string concatenation without a path
                    # separator — the file lands *beside* result_path with
                    # its name as a prefix; confirm this is intended.
                    save_file_path = str(
                        opt.result_path) + 'save_{}_{}_{}'.format(
                            epoch, opt.train_crop, opt.batch_size)
                    save_checkpoint(save_file_path, model, optimizer)

            if not opt.no_val:
                prev_val_loss, val_acc = val_epoch(epoch, val_loader, model,
                                                   val_logger)

            # NOTE(review): fluid's LR decay objects step via .epoch()
            # rather than torch's .step() — presumably correct for this
            # scheduler type; verify against the paddle version in use.
            if not opt.no_train and opt.lr_scheduler == 'multistep':
                scheduler.epoch()
            elif not opt.no_train and opt.lr_scheduler == 'plateau':
                # NOTE(review): prev_val_loss is unbound here if opt.no_val
                # is set — confirm those flags are mutually exclusive.
                scheduler.step(prev_val_loss)

            if not opt.no_val:
                if val_acc > best_acc:
                    best_acc = val_acc
                    save_file_path = str(
                        opt.result_path) + 'save_{}_{}_best_val_acc'.format(
                            epoch, opt.train_crop)
                    save_checkpoint(save_file_path, model, optimizer)

            if not opt.no_train:
                current_lr = optimizer.current_step_lr()
                # prev_val_loss is a paddle tensor here (hence .numpy()).
                print("current val_loss is %s, current lr is %s" %
                      (prev_val_loss.numpy()[0], current_lr))

        if opt.inference:
            inference_loader, inference_class_names = get_inference_utils(opt)
            inference_result_path = opt.result_path / '{}_{}.json'.format(
                opt.inference_subset, opt.train_crop)

            inference.inference(inference_loader, model, inference_result_path,
                                inference_class_names,
                                opt.inference_no_average, opt.output_topk)
Esempio n. 9
0
def main():
    """End-to-end training pipeline for the BERT sentiment classifier.

    Downloads the dataset on first run, fine-tunes the model with a
    class-weighted cross-entropy loss, keeps the checkpoint with the best
    validation accuracy, then reports test metrics and a sample
    single-review prediction.
    """
    # Fetch the data if it is not on disk yet, then load it.
    if not os.path.exists(DATASET_PATH):
        download_dataset()
    reviews = pd.read_csv(DATASET_PATH)

    tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

    # Shift 1-based star ratings down to 0-based class labels.
    reviews['overall'] -= 1

    # Stratified 75/25 split so every rating class appears in both sets.
    df_train, df_test = train_test_split(reviews,
                                         test_size=0.25,
                                         random_state=RANDOM_SEED,
                                         stratify=reviews[['overall']])
    train_data_loader = create_data_loader(df_train, tokenizer, TOKEN_MAX_LEN,
                                           BATCH_SIZE)
    test_data_loader = create_data_loader(df_test, tokenizer, TOKEN_MAX_LEN,
                                          BATCH_SIZE)

    model = SentimentClassifier(len(class_names),
                                PRE_TRAINED_MODEL_NAME).to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_data_loader) * EPOCHS)

    # Class weights for the loss function compensate the imbalanced ratings.
    class_weights = compute_class_weight(classes=[0, 1, 2, 3, 4],
                                         y=df_train['overall'],
                                         class_weight='balanced')
    weight_tensor = torch.FloatTensor(class_weights).to(device)
    loss_fn = nn.CrossEntropyLoss(weight=weight_tensor).to(device)

    history = defaultdict(list)
    best_accuracy = 0

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)

        train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn,
                                            optimizer, device, scheduler,
                                            len(df_train))
        print(f'Train loss {train_loss} accuracy {train_acc}')

        val_acc, val_loss = eval_model(model, test_data_loader, loss_fn,
                                       device, len(df_test))
        print(f'Val loss {val_loss} accuracy {val_acc}')
        print()

        # Record the epoch metrics for later plotting.
        for metric_name, metric_value in (('train_acc', train_acc),
                                          ('train_loss', train_loss),
                                          ('val_acc', val_acc),
                                          ('val_loss', val_loss)):
            history[metric_name].append(metric_value)

        # Snapshot the weights whenever validation accuracy improves.
        if val_acc > best_accuracy:
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_accuracy = val_acc

    plot_history(history)

    test_acc, _ = eval_model(model, test_data_loader, loss_fn, device,
                             len(df_test))

    y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
        model, test_data_loader, device)

    # Persist the final (last-epoch) weights alongside the best checkpoint.
    os.makedirs("model", exist_ok=True)
    torch.save(model.state_dict(), "model/model.pt")

    show_metrics(y_pred, y_pred_probs, y_test)

    predict_single_review("I like it, perfect", Preprocessing(), tokenizer,
                          model, device)
# Example n. 10
# 0
def main_worker(index, opt):
    """Per-process training entry point (one worker per GPU).

    Seeds all RNGs, optionally joins a NCCL process group, builds (or
    restores from mlflow / checkpoints) the model, then runs the
    train/validate loop and optional final inference.

    Args:
        index: Worker/GPU index; a negative value means single-process
            execution on the pre-configured device.
        opt: Options namespace. Several fields are mutated in place:
            ``device``, ``dist_rank``, ``batch_size``, ``n_threads``,
            ``is_master_node`` and ``begin_epoch``.
    """
    # Seed every RNG so runs are reproducible per worker.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    # Pin this worker to its own CUDA device.
    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Derive the global rank, then split the batch size and thread
        # budget evenly across the GPUs of this node (threads round up).
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    # Rank 0 (or the only process) handles logging and checkpointing.
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0
    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
        print('resume model from ', opt.resume_path)

    print('model after resume:', model)

    # save model to current running id
    # mlflow.pytorch.log_model(model, "action_model")
    # model_path = mlflow.get_artifact_uri("action_model")
    # print('mlflow action model path: ', model_path)
    # model = mlflow.pytorch.load_model(model_path)
    # Tag the current mlflow run when both tag name and value are supplied.
    if opt.ml_tag_name != '' and opt.ml_tag_value != '':
        # mlflow.set_tag("test_tag", 'inference_test')
        mlflow.set_tag(opt.ml_tag_name, opt.ml_tag_value)

    # load from previous published model version
    # NOTE(review): this overrides any model built/restored above with the
    # registered mlflow version — confirm that is the intended precedence.
    if opt.ml_model_name != '' and opt.ml_model_version != '':
        # model_name = 'action_model'
        # model_version = '1'
        model_uri = "models:/{}/{}".format(opt.ml_model_name,
                                           opt.ml_model_version)
        model = mlflow.pytorch.load_model(model_uri)

    model = make_data_parallel(model, opt.distributed, opt.device)

    # Fine-tune only from ft_begin_module onward when starting from a
    # pretrained checkpoint; otherwise optimize everything.
    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            # Restore epoch counter, optimizer and scheduler state.
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    # Only the master node writes TensorBoard logs; purge_step trims
    # stale events when resuming mid-run.
    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Re-shuffle shards differently each epoch.
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)
                if opt.ml_model_name != '':
                    mlflow.pytorch.log_model(model, opt.ml_model_name)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)
            mlflow.log_metric("loss", prev_val_loss)

        # Step the LR schedule: multistep per epoch, plateau on val loss.
        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
# Example n. 11
# 0
def main(args):
    """Train LeNet-5 on the dataset selected via CLI arguments.

    Builds the dataloaders, model, loss, optimizer and LR scheduler from
    *args*, optionally resumes from a checkpoint, then runs the training
    loop with TensorBoard logging and best-validation-loss checkpointing.

    Args:
        args: Parsed argparse namespace (dataset, batch_size, workers,
            activation, pooling, drop_rate, gpu, init_weights, resume,
            criterion, optimizer, lr, momentum, weight_decay, nesterov,
            output_folder, training_name, nb_epochs).

    Raises:
        NotImplementedError: For any unsupported dataset / activation /
            pooling / criterion / optimizer choice.
    """

    set_seed(SEED)

    train_transforms, test_transforms = get_transforms(args.dataset)
    print(f"Data transformations:\n{train_transforms}\n")

    # Get the dataloaders
    train_loader, test_loader = get_dataloaders(args.dataset, args.batch_size,
                                                args.workers, train_transforms,
                                                test_transforms)

    # Architecture: only single-channel (MNIST), ReLU and 2x2 max-pooling
    # are supported for now.
    if args.dataset == 'mnist':
        in_channels = 1
    else:
        raise NotImplementedError()
    if args.activation == 'relu':
        activation = nn.ReLU(inplace=True)
    else:
        raise NotImplementedError()
    if args.pooling == 'max':
        pooling = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
    else:
        raise NotImplementedError()
    drop_rate = args.drop_rate

    # Build model
    model = LeNet5(in_channels, activation, pooling, drop_rate)
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)
        model = model.cuda()
    # Weight normal initialization
    if args.init_weights:
        model.apply(normal_initialization)

    # Loss function & optimizer
    if args.criterion == 'ce':
        criterion = nn.CrossEntropyLoss()
    else:
        raise NotImplementedError()
    if args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay,
                              nesterov=args.nesterov)
    elif args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
    else:
        raise NotImplementedError()

    # BUGFIX: the resume step used to run before `optimizer` existed,
    # raising UnboundLocalError whenever --resume was supplied. It now
    # runs after the optimizer is built so its state can be restored too.
    start_epoch = 0
    if args.resume is not None:
        model, optimizer, start_epoch = load_training_state(
            model, optimizer, args.resume)

    # Halve the LR as soon as validation loss stops improving.
    scheduler = ReduceLROnPlateau(optimizer,
                                  factor=0.5,
                                  patience=0,
                                  threshold=1e-2,
                                  verbose=True)

    # Output folder for checkpoints
    output_folder = os.path.join(args.output_folder, args.training_name)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # Start each run with a fresh TensorBoard log directory.
    log_path = os.path.join(args.output_folder, 'logs', args.training_name)
    if os.path.exists(log_path):
        rmtree(log_path)
    logger = SummaryWriter(log_path)

    # Train
    best_loss = math.inf
    mb = master_bar(range(args.nb_epochs))
    for epoch_idx in mb:
        # Training
        train_epoch(model,
                    train_loader,
                    optimizer,
                    criterion,
                    mb,
                    tb_logger=logger,
                    epoch=start_epoch + epoch_idx)

        # Evaluation
        val_loss, accuracy = evaluate(model, test_loader, criterion)

        mb.first_bar.comment = f"Epoch {start_epoch+epoch_idx+1}/{start_epoch+args.nb_epochs}"
        mb.write(
            f'Epoch {start_epoch+epoch_idx+1}/{start_epoch+args.nb_epochs} - Validation loss: {val_loss:.4} (Acc@1: {accuracy:.2%})'
        )

        # State saving: keep only the best-validation-loss checkpoint.
        if val_loss < best_loss:
            print(
                f"Validation loss decreased {best_loss:.4} --> {val_loss:.4}: saving state..."
            )
            best_loss = val_loss
            torch.save(
                dict(epoch=start_epoch + epoch_idx,
                     model_state_dict=model.state_dict(),
                     optimizer_state_dict=optimizer.state_dict(),
                     val_loss=val_loss),
                os.path.join(output_folder, "training_state.pth"))

        if logger is not None:
            # Log against the global iteration count so resumed runs line up.
            current_iter = (start_epoch + epoch_idx + 1) * len(train_loader)
            logger.add_scalar("Validation loss", val_loss, current_iter)
            logger.add_scalar("Error rate", 1 - accuracy, current_iter)
            logger.flush()
        scheduler.step(val_loss)
# Example n. 12
# 0
def run(*configs, group=None):
    """Fine-tune a transformer for short-answer grading and track it in wandb.

    Loads and merges the given configuration sources, builds the model and
    dataloaders, then trains until ``config.max_epochs`` or early stopping
    on weighted F1 (``config.max_patience``). The best-F1 checkpoint is
    saved to the wandb run directory when logging is enabled.

    Args:
        *configs: Configuration sources forwarded to ``configuration.load``.
        group: Optional extra suffix appended to the wandb group name.
    """
    config = configuration.load(*configs)
    # Build the wandb group name from data source (+ optional suffixes).
    if config.group:
        config.group = config.data_source + '-' + config.group
    else:
        config.group = config.data_source
    if group:
        config.group = config.group + "-" + str(group)
    if config.from_scratch:
        config.group = 'scratch-' + config.group
        config.name = 'scratch-' + config.name
    if config.log:
        wandb.init(project='explainable-asag',
                   group=config.group,
                   name=config.name,
                   config=config)
        # From here on, config is the wandb-managed copy.
        config = wandb.config

    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        config.model_name, num_labels=config.num_labels)
    if config.token_types:
        # Some architectures expose embedding_size; fall back to hidden_size.
        embedding_size = model.config.__dict__.get('embedding_size',
                                                   model.config.hidden_size)
        update_token_type_embeddings(model, embedding_size,
                                     model.config.initializer_range)
    if config.from_scratch:
        # Discard the pretrained weights and re-initialize.
        model.init_weights()

    cuda = torch.cuda.is_available()
    if cuda:
        model.cuda()

    train_dataloader = dataset.dataloader(val_mode=False,
                                          data_file=config.train_data,
                                          data_source=config.data_source,
                                          vocab_file=config.model_name,
                                          num_labels=config.num_labels,
                                          train_percent=config.train_percent,
                                          batch_size=config.batch_size,
                                          drop_last=config.drop_last,
                                          num_workers=config.num_workers)
    val_dataloader = dataset.dataloader(val_mode=True,
                                        data_file=config.val_data,
                                        data_source=config.data_source,
                                        vocab_file=config.model_name,
                                        num_labels=config.num_labels,
                                        train_percent=config.val_percent,
                                        batch_size=config.batch_size,
                                        drop_last=config.drop_last,
                                        num_workers=config.num_workers)

    # Look the optimizer class up by name in torch.optim.
    optimizer = torch.optim.__dict__[config.optimizer](
        model.parameters(), lr=config.learn_rate, **config.optimizer_kwargs)

    # Hack to get any scheduler we want. transformers.get_scheduler does not implement e.g. linear_with_warmup.
    get_scheduler = {
        'linear_with_warmup':
        transformers.get_linear_schedule_with_warmup,
        'cosine_with_warmup':
        transformers.get_cosine_schedule_with_warmup,
        'constant_with_warmup':
        transformers.get_constant_schedule_with_warmup,
        'cosine_with_hard_restarts_with_warmup':
        transformers.get_cosine_with_hard_restarts_schedule_with_warmup
    }
    lr_scheduler = get_scheduler[config.scheduler](optimizer,
                                                   *config.scheduler_args,
                                                   **config.scheduler_kwargs)

    best_f1 = 0.0
    patience = 0
    epoch = 0
    log_line = ''
    try:
        #while lr_scheduler.last_epoch <= total_steps:
        while epoch < config.max_epochs:
            epoch += 1
            av_epoch_loss = training.train_epoch(
                train_dataloader,
                model,
                optimizer,
                lr_scheduler,
                config.num_labels,
                cuda,
                log=config.log,
                token_types=config.token_types)
            #tidy stuff up every epoch
            gc.collect()
            torch.cuda.empty_cache()
            metrics_weighted, metrics_macro = training.val_loop(
                model, val_dataloader, cuda, token_types=config.token_types)
            p, r, f1, val_acc = metrics_weighted
            p_m, r_m, f1_m, val_acc_m = metrics_macro
            log_line = f'model: {config.model_name} | epoch: {epoch} | av_epoch_loss {av_epoch_loss:.5f} | f1: {f1:.5f} | accuracy: {val_acc:.5f} \n'
            print(log_line[:-1])
            if config.log:
                wandb.log({
                    'precision': p,
                    'recall': r,
                    'f1': f1,
                    'accuracy': val_acc,
                    'av_epoch_loss': av_epoch_loss
                })
                wandb.log({
                    'precision-macro': p_m,
                    'recall-macro': r_m,
                    'f1-macro': f1_m,
                    'accuracy-macro': val_acc_m
                })
            # Early stopping on weighted F1: save on improvement, otherwise
            # count toward max_patience.
            if f1 > best_f1:
                if config.log:
                    this_model = os.path.join(wandb.run.dir, 'best_f1.pt')
                    print("saving to: ", this_model)
                    torch.save([model.state_dict(), config.__dict__],
                               this_model)
                    wandb.save('*.pt')
                best_f1 = f1
                patience = 0  #max((0, patience-1))
            elif config.max_patience:
                patience += 1
                if patience >= config.max_patience:
                    break
        # Move stuff off the gpu
        model.cpu()
        #This is for sure a kinda dumb way of doing it, but the least mentally taxing right now
        # Rebuilding the optimizer drops its GPU-resident state tensors.
        optimizer = torch.optim.__dict__[config.optimizer](
            model.parameters(), lr=config.learn_rate)
        gc.collect()
        torch.cuda.empty_cache()
        #return model   #Gives Error

    except KeyboardInterrupt:
        # On manual interrupt, still upload checkpoints and free the GPU.
        if config.log:
            wandb.save('*.pt')
        #Move stuff off the gpu
        model.cpu()
        optimizer = torch.optim.__dict__[config.optimizer](
            model.parameters(), lr=config.learn_rate)
        gc.collect()
        torch.cuda.empty_cache()
# Example n. 13
# 0
def main_worker(index, opt):
    """Per-GPU worker: build the model, train/validate with FocalLoss,
    run optional inference, and dump per-epoch confusion matrices to CSV.

    Args:
        index: Worker/GPU index; a negative value means single-process
            execution on the pre-configured device.
        opt: Options namespace. Mutated in place: ``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``,
            ``is_master_node`` and ``begin_epoch``.
    """
    # Seed every RNG so runs are reproducible per worker.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    # Pin this worker to its own CUDA device.
    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Derive the global rank, then split the batch size and thread
        # budget evenly across the GPUs of this node (threads round up).
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    # Rank 0 (or the only process) handles logging and checkpointing.
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    # Fine-tune only from ft_begin_module onward when starting from a
    # pretrained checkpoint; otherwise optimize everything.
    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    #criterion = CrossEntropyLoss().to(opt.device)
    # ADDED for 231n
    criterion = FocalLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            # Restore epoch counter, optimizer and scheduler state.
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    # Only the master node writes TensorBoard logs; purge_step trims
    # stale events when resuming mid-run.
    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    conf_mtx_dict = {}  # ADDED for CS231n

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Re-shuffle shards differently each epoch.
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed,
                                      conf_mtx_dict)  # ADDED for CS231n

        # ADDED for 231n - uncomment if using cross entropy loss
        #if not opt.no_train and opt.lr_scheduler == 'multistep':
        #    scheduler.step()
        #elif not opt.no_train and opt.lr_scheduler == 'plateau':
        #    scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)

    # ADDED for CS231n: persist the per-epoch confusion matrices.
    # BUGFIX: the file handle was previously leaked (bare open() passed to
    # csv.writer); a context manager now guarantees flush/close, and
    # newline='' follows the csv-module file-opening requirement.
    with open("conf_mtxs.csv", "w+", newline='') as conf_mtx_f:
        conf_mtx_file = csv.writer(conf_mtx_f)
        for key, val in conf_mtx_dict.items():
            conf_mtx_file.writerow([key, val])
# Example n. 14
# 0
        if args.visualize_interval != "never":
            plot_data(args.results_dir /
                      Path(f"epoch_0_val_loss_inf.png") if save else None,
                      args.figure_type,
                      model,
                      all_data,
                      args.batch_size,
                      show=show,
                      only_subset_labels=subset_labels)
        for epoch in range(1, args.epochs + 1):
            start_time = time.time()
            train_loss, train_metrics = train_epoch(
                epoch=epoch,
                model=model,
                optimizer=optimizer,
                train_loader=train_loader,
                log_interval=args.log_interval,
                clip_grad_norm=args.clip_grad_norm,
                clip_grad_value=args.clip_grad_value,
                random_weighted_sampling=args.random_weighted_sampling)

            if args.val_ratio > 0:
                val_loss, val_metrics = validate(epoch, model, val_loader)
                loss_str = "Validation"
                loss_value_str = f"{val_loss:.5f}"
                val_str = f"{loss_str} loss: {loss_value_str} "
                val_nll_losses.append(val_metrics["nll_loss"])
                val_kld_losses.append(val_metrics["kld_loss"])
                val_param_klds.append(val_metrics["param_kld"])
                val_total_losses.append(val_loss)
# Example n. 15
# 0
def main_worker(index, opt):
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    if opt.inference:
        model = generate_model(opt)
    else:
        model = generate_model(opt, use_features=True)

    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    #####################################################################################
    ### here add a classifier to predict videos and audios
    if opt.inference is False:
        ### define loss
        criterion = CrossEntropyLoss().to(opt.device)

        if opt.use_audio or opt.use_image:
            criterion_jsd = JSDLoss(weight=0.5)

        #################################################################################
        if opt.use_audio:
            ### define loss
            criterion_ct_av = NCELoss(temperature=0.5)
            ### audio teacher model
            feature_dim = 512 * 2
            if opt.pretrain_path is not None:
                joint_prediction_aud = generate_prediction(
                    feature_dim, opt.n_finetune_classes, normalization=True)
            else:
                joint_prediction_aud = generate_prediction(feature_dim,
                                                           opt.n_classes,
                                                           normalization=True)
            if opt.resume_path is not None:
                aux_checkpoint = Path(
                    os.path.join(str(opt.resume_path.parent),
                                 str(opt.resume_path.name[:-4] +
                                     '_audio.pth')))
                joint_prediction_aud = resume_model(aux_checkpoint, opt.arch,
                                                    joint_prediction_aud)

            joint_prediction_aud = make_data_parallel(joint_prediction_aud,
                                                      opt.distributed,
                                                      opt.device)
            aud_para = joint_prediction_aud.parameters()
            joint_prediction_aud.cuda()
        else:
            aud_para = None

        #################################################################################
        if opt.use_image:
            ### define loss
            criterion_ct_iv = NCELoss(temperature=0.1)
            ### image teacher model
            image_model = torchvision.models.resnet34(pretrained=True)
            # remove the fc layers (only use the image features)
            image_model = torch.nn.Sequential(
                *list(image_model.children())[:-1])
            image_model = make_data_parallel(image_model, opt.distributed,
                                             opt.device)
            feature_dim = 512 * 2
            if opt.pretrain_path is not None:
                joint_prediction_img = generate_prediction(
                    feature_dim, opt.n_finetune_classes, normalization=True)
            else:
                joint_prediction_img = generate_prediction(feature_dim,
                                                           opt.n_classes,
                                                           normalization=True)
            if opt.resume_path is not None:
                aux_checkpoint = Path(
                    os.path.join(str(opt.resume_path.parent),
                                 str(opt.resume_path.name[:-4] +
                                     '_image.pth')))
                joint_prediction_img = resume_model(aux_checkpoint, opt.arch,
                                                    joint_prediction_img)

            joint_prediction_img = make_data_parallel(joint_prediction_img,
                                                      opt.distributed,
                                                      opt.device)
            img_para = joint_prediction_img.parameters()
            joint_prediction_img.cuda()
        else:
            img_para = None

        #################################################################################
        (train_loader, train_sampler, train_logger, train_batch_logger, optimizer, optimizer_av, optimizer_iv, scheduler) = \
            get_train_utils(opt, model_parameters=parameters, av_parameters=aud_para, iv_parameters=img_para)

        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                # Resuming mid-run: re-apply the configured milestones to the
                # restored multistep scheduler.
                scheduler.milestones = opt.multistep_milestones

    # Validation utilities are only built when validation is enabled.
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    # TensorBoard writer (master node only). When resuming from a later
    # epoch, purge_step drops any stale events logged past the resume point.
    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    # prev_val_loss feeds the 'plateau' LR scheduler; pre_val_acc tracks the
    # best validation accuracy seen so far for best-model checkpointing.
    # NOTE(review): inconsistent naming ('prev_' vs 'pre_') — consider
    # renaming pre_val_acc to best_val_acc for clarity.
    prev_val_loss = None
    pre_val_acc = 0.0
    # image_size is only forwarded to the image-branch training loops when it
    # exceeds the base sample size; otherwise None (presumably "use default").
    if opt.image_size > opt.sample_size:
        image_size = opt.image_size
    else:
        image_size = None
    # Main epoch loop (1-indexed, inclusive of n_epochs).
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Re-seed the distributed sampler per epoch so each process
                # sees a different shuffling order every epoch.
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            # Dispatch on which auxiliary optimizers exist:
            #   neither      -> plain video-only training
            #   audio only   -> audio-video joint training
            #   image only   -> image-video joint training
            #   both         -> audio+image+video joint training
            # The specific loss/prediction-head semantics live in the
            # respective train_*_epoch functions (defined elsewhere).
            if optimizer_av is None and optimizer_iv is None:
                train_epoch(epoch=i,
                            data_loader=train_loader,
                            model=model,
                            criterion=criterion,
                            optimizer=optimizer,
                            device=opt.device,
                            current_lr=current_lr,
                            epoch_logger=train_logger,
                            batch_logger=train_batch_logger,
                            tb_writer=tb_writer,
                            distributed=opt.distributed)
            elif optimizer_av is not None and optimizer_iv is None:
                train_a_epoch(epoch=i,
                              data_loader=train_loader,
                              model=model,
                              joint_prediction_aud=joint_prediction_aud,
                              criterion=criterion,
                              criterion_jsd=criterion_jsd,
                              criterion_ct_av=criterion_ct_av,
                              optimizer=optimizer,
                              optimizer_av=optimizer_av,
                              device=opt.device,
                              current_lr=current_lr,
                              epoch_logger=train_logger,
                              batch_logger=train_batch_logger,
                              tb_writer=tb_writer,
                              distributed=opt.distributed)
            elif optimizer_av is None and optimizer_iv is not None:
                train_i_epoch(epoch=i,
                              data_loader=train_loader,
                              model=model,
                              image_model=image_model,
                              joint_prediction_img=joint_prediction_img,
                              criterion=criterion,
                              criterion_jsd=criterion_jsd,
                              criterion_ct_iv=criterion_ct_iv,
                              optimizer=optimizer,
                              optimizer_iv=optimizer_iv,
                              device=opt.device,
                              current_lr=current_lr,
                              epoch_logger=train_logger,
                              batch_logger=train_batch_logger,
                              tb_writer=tb_writer,
                              distributed=opt.distributed,
                              image_size=image_size)
            else:
                train_ai_epoch(epoch=i,
                               data_loader=train_loader,
                               model=model,
                               image_model=image_model,
                               joint_prediction_aud=joint_prediction_aud,
                               joint_prediction_img=joint_prediction_img,
                               criterion=criterion,
                               criterion_jsd=criterion_jsd,
                               criterion_ct_av=criterion_ct_av,
                               criterion_ct_iv=criterion_ct_iv,
                               optimizer=optimizer,
                               optimizer_av=optimizer_av,
                               optimizer_iv=optimizer_iv,
                               device=opt.device,
                               current_lr=current_lr,
                               epoch_logger=train_logger,
                               batch_logger=train_batch_logger,
                               tb_writer=tb_writer,
                               distributed=opt.distributed,
                               image_size=image_size,
                               loss_weight=opt.loss_weight)

            # Periodic checkpointing (master node only). The auxiliary
            # audio/image prediction heads are saved separately, though each
            # checkpoint reuses the main `optimizer`/`scheduler` state.
            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)
                if opt.use_audio:
                    save_file_path = opt.result_path / 'save_{}_audio.pth'.format(
                        i)
                    save_checkpoint(save_file_path, i, opt.arch,
                                    joint_prediction_aud, optimizer, scheduler)
                if opt.use_image:
                    save_file_path = opt.result_path / 'save_{}_image.pth'.format(
                        i)
                    save_checkpoint(save_file_path, i, opt.arch,
                                    joint_prediction_img, optimizer, scheduler)
            # Validate every val_freq epochs; keep a 'save_model.pth' snapshot
            # of the best-accuracy model seen so far.
            # NOTE(review): this whole validation/scheduler section sits under
            # `if not opt.no_train:`, so running with --no_train performs no
            # validation and never steps the scheduler — confirm intended.
            if not opt.no_val and i % opt.val_freq == 0:
                prev_val_loss, val_acc = val_epoch(i, val_loader, model,
                                                   criterion, opt.device,
                                                   val_logger, tb_writer,
                                                   opt.distributed)
                if pre_val_acc < val_acc:
                    pre_val_acc = val_acc
                    save_file_path = opt.result_path / 'save_model.pth'
                    save_checkpoint(save_file_path, i, opt.arch, model,
                                    optimizer, scheduler)

            # LR scheduling: multistep steps every epoch; plateau steps on the
            # most recent validation loss (skipped until validation has run).
            if not opt.no_train and opt.lr_scheduler == 'multistep':
                scheduler.step()
            elif not opt.no_train and opt.lr_scheduler == 'plateau':
                if prev_val_loss is not None:
                    scheduler.step(prev_val_loss)

    # Optional post-training inference pass; results go to
    # <result_path>/<inference_subset>.json.
    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)
        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)