Example #1
def main(args):
    """Main function for the training pipeline

    :args: command-line arguments
    :returns: None

    """
    ##########################################################################
    #                             Basic settings                             #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(model_dir, exist_ok=True)

    ##########################################################################
    #  Define all the necessary variables for model training and evaluation  #
    ##########################################################################
    writer = SummaryWriter(log_dir)
    train_dataset = dataset.NCovDataset('data/', stage='train')
    weights = train_dataset.make_weights_for_balanced_classes()
    weights = torch.DoubleTensor(weights)
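    # WeightedRandomSampler draws each sample with probability proportional
    # to its weight, so under-represented classes are seen more often.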
    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        weights, len(train_dataset.case_ids))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=1,
                                               num_workers=20,
                                               drop_last=False,
                                               sampler=sampler)

    val_dataset = dataset.NCovDataset('data/', stage='val')
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=11,
                                             drop_last=False)

    cov_net = model.COVNet(n_classes=3)
    if torch.cuda.is_available():
        cov_net = cov_net.cuda()
    optimizer = optim.Adam(cov_net.parameters(), lr=args.lr, weight_decay=0.1)

    if args.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               patience=3,
                                                               factor=.3,
                                                               threshold=1e-4,
                                                               verbose=True)
    elif args.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=args.gamma)

    best_val_loss = float('inf')
    best_val_accu = 0.0

    iteration_change_loss = 0
    t_start_training = time.time()
    ##########################################################################
    #                           Main training loop                           #
    ##########################################################################
    for epoch in range(args.epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(cov_net, train_loader, epoch,
                                               args.epochs, optimizer, writer,
                                               current_lr, args.log_every)

        with torch.no_grad():
            val_loss, val_metric = evaluate_model(cov_net, val_loader, epoch,
                                                  args.epochs, writer,
                                                  current_lr)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if args.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif args.lr_scheduler == 'step':
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        utils.print_epoch_progress(train_loss, val_loss, delta, train_metric,
                                   val_metric)
        iteration_change_loss += 1
        print('-' * 30)

        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
        file_name = ('train_acc_{}_val_acc_{}_epoch_{}.pth'.format(
            train_acc, val_acc, epoch))
        torch.save(cov_net, os.path.join(model_dir, file_name))

        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if args.save_model:
                torch.save(cov_net, os.path.join(model_dir, 'best.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        if iteration_change_loss == args.patience:
            print('Early stopping after {0} epochs without a decrease in '
                  'the validation loss'.format(iteration_change_loss))
            break
    t_end_training = time.time()
    print('Training took {:.2f}s'.format(t_end_training - t_start_training))
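
Example 1 leans on several helpers that are not shown: get_lr, train_model, evaluate_model, and the dataset's make_weights_for_balanced_classes. A minimal sketch of the first two follows; only the call signatures come from the example, while the bodies (cross-entropy loss, accuracy metric) are assumptions.

import numpy as np
import torch
import torch.nn as nn

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group."""
    return optimizer.param_groups[0]['lr']

def train_model(model, loader, epoch, num_epochs, optimizer, writer,
                current_lr, log_every):
    """One training epoch; an illustrative sketch, not the original body."""
    model.train()
    criterion = nn.CrossEntropyLoss()
    losses, correct, seen = [], 0, 0
    for i, (inputs, labels) in enumerate(loader):
        if torch.cuda.is_available():
            inputs, labels = inputs.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        correct += (outputs.argmax(dim=1) == labels).sum().item()
        seen += labels.size(0)
        if i % log_every == 0:
            writer.add_scalar('Train/BatchLoss', loss.item(),
                              epoch * len(loader) + i)
    writer.add_scalar('Train/LR', current_lr, epoch)
    return float(np.mean(losses)), {'accuracy': correct / seen}

evaluate_model would mirror this with model.eval() and no optimizer step; the caller already wraps it in torch.no_grad().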
Example #2
def main(opts):
    """Main function for the training pipeline
    :opts: command-line arguments
    :returns: None
    """
    pprint(vars(opts))
    ##########################################################################
    #                             Basic settings                             #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(model_dir, exist_ok=True)

    os.makedirs(os.path.join(log_dir, opts.run_name), exist_ok=True)
    with open(os.path.join(log_dir, opts.run_name, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    ##########################################################################
    #  Define all the necessary variables for model training and evaluation  #
    ##########################################################################
    writer = SummaryWriter(os.path.join(log_dir, opts.run_name))

    train_dataset = SegmentationDataset(is_train=True)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opts.batch_size,
                                               num_workers=4,
                                               drop_last=False,
                                               shuffle=True)

    val_dataset = SegmentationDataset(is_train=False)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=20,
                                             shuffle=False,
                                             num_workers=0,
                                             drop_last=False)

    model = load_model(opts, n_classes=4)

    if torch.cuda.is_available():
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=opts.lr, weight_decay=0.1)

    if opts.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            patience=opts.patience,
            factor=.3,
            threshold=0.1,
            verbose=True)
    elif opts.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=opts.gamma)

    best_val_loss = float('inf')
    best_val_accu = 0.0

    iteration_change_loss = 0
    t_start_training = time.time()
    ##########################################################################
    #                           Main training loop                           #
    ##########################################################################
    for epoch in range(opts.epochs):
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(model, train_loader, epoch,
                                               optimizer, writer, opts)

        with torch.no_grad():
            val_loss, val_metric = evaluate_model(model, val_loader, epoch,
                                                  writer, opts)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if opts.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif opts.lr_scheduler == 'step':
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        utils.print_epoch_progress(epoch, opts.epochs, train_loss, val_loss,
                                   delta, train_metric, val_metric)

    t_end_training = time.time()
    print('Training took {:.2f}s'.format(t_end_training - t_start_training))
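
Example 2 reads seven attributes from opts: run_name, batch_size, lr, lr_scheduler, patience, gamma, and epochs. A plausible argparse entry point is sketched below; the attribute names come from the example, but the types, defaults, and help strings are assumptions.

import argparse

def parse_opts():
    """Build the options object main() expects; defaults are illustrative."""
    parser = argparse.ArgumentParser(description='Segmentation training')
    parser.add_argument('--run_name', default='baseline',
                        help='subdirectory for logs and checkpoints')
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--lr_scheduler', choices=['plateau', 'step'],
                        default='plateau')
    parser.add_argument('--patience', type=int, default=3,
                        help='epochs before ReduceLROnPlateau lowers the lr')
    parser.add_argument('--gamma', type=float, default=0.5,
                        help='decay factor for StepLR')
    parser.add_argument('--epochs', type=int, default=50)
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_opts())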
Example #3
def main(args):
    """Main function for the training pipeline
    :args: command-line arguments
    :returns: None
    """
    print("The parameters are set as follows:")
    print(args)
    # First, set up the directories where outputs will be saved
    exp_dir = 'experiments'
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(model_dir, exist_ok=True)

    batch_size = args.batch_size
    train_set = SliceSet2D()

    validation_split = .4
    shuffle_dataset = True
    random_seed = 42
    set_size = len(train_set)
    indices = list(range(set_size))

    split = int(np.floor(validation_split * set_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]
    print("Set Size: {}|Train Size: {}| Validation Size: {}".format(
        len(indices), len(train_indices), len(val_indices)))
    print("Model will be saved in \" ./experiments/model\". Start training...")
    # Creating data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(train_set,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)
    cov_net = model.COVNet(n_classes=3)

    if torch.cuda.is_available():
        cov_net = cov_net.cuda()

    optimizer = Adam(cov_net.parameters(), lr=args.lr, weight_decay=0.1)

    if args.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               patience=3,
                                                               factor=.3,
                                                               threshold=1e-4,
                                                               verbose=True)
    elif args.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=args.gamma)

    best_val_loss = float('inf')
    best_val_accu = 0.0

    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                           Main training loop                           #
    ##########################################################################

    epochs = args.epochs

    for epoch in range(epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(cov_net, train_loader, epoch,
                                               epochs, optimizer, current_lr,
                                               100)

        with torch.no_grad():
            val_loss, val_metric = evaluate_model(cov_net, validation_loader,
                                                  epoch, epochs, current_lr)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if args.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif args.lr_scheduler == 'step':
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        utils.print_epoch_progress(epoch, epochs, train_loss, val_loss, delta,
                                   train_metric, val_metric)
        iteration_change_loss += 1
        print('-' * 60)
        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
        if args.save_model:
            file_name = ('train_acc_{}_val_acc_{}_epoch_{}.pth'.format(
                train_acc, val_acc, epoch))
            torch.save(cov_net, os.path.join(model_dir, file_name))

        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if args.save_model:
                torch.save(cov_net, os.path.join(model_dir, 'best.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        if iteration_change_loss == args.patience:
            print('Early stopping after {0} epochs without a decrease in '
                  'the validation loss'.format(iteration_change_loss))
            break
    t_end_training = time.time()
    print('Training took {:.2f}s'.format(t_end_training - t_start_training))
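
The manual shuffle-and-slice split above is a common pattern; the same 60/40 split can be written more compactly with torch.utils.data.random_split. A sketch follows (the ratio and seed mirror the example, though random_split shuffles internally, so the resulting subsets will not match the example's exact indices):

import torch
from torch.utils.data import DataLoader, random_split

def make_loaders(dataset, batch_size, validation_split=0.4, seed=42):
    """Split a dataset and return train/validation loaders."""
    val_size = int(validation_split * len(dataset))
    train_size = len(dataset) - val_size
    generator = torch.Generator().manual_seed(seed)
    train_set, val_set = random_split(dataset, [train_size, val_size],
                                      generator=generator)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader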
Example #4
def main(opts):
    """Main function for the training pipeline
    :opts: command-line arguments
    :returns: None
    """
    ##########################################################################
    #                             Basic settings                             #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(os.path.join(model_dir, opts.run_name), exist_ok=True)
    os.makedirs(os.path.join(log_dir, opts.run_name), exist_ok=True)

    pprint(vars(opts))
    with open(os.path.join(log_dir, opts.run_name, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    torch.manual_seed(opts.seed)
    np.random.seed(opts.seed)
    random.seed(opts.seed)

    ##########################################################################
    #  Define all the necessary variables for model training and evaluation  #
    ##########################################################################
    writer = SummaryWriter(os.path.join(log_dir, opts.run_name), flush_secs=5)

    if opts.train_mode == 'combined':
        train_dataset = get_train_dataset(opts.data_root, opts, opts.folder1,
                                          opts.folder2, opts.folder3)
    elif opts.train_mode == 'oversampling':
        train_dataset = get_train_dataset_by_oversampling(
            opts.data_root, opts, opts.folder1, opts.folder2, opts.folder3)
    elif opts.train_mode == 'pretrain_and_finetune':
        train_dataset, finetune_dataset = get_pretrain_and_finetune_datast(
            opts.data_root, opts, opts.folder1, opts.folder2, opts.folder3)
        finetune_loader = torch.utils.data.DataLoader(
            finetune_dataset,
            batch_size=opts.batch_size,
            num_workers=opts.num_workers,
            drop_last=False,
            shuffle=True)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opts.batch_size,
                                               num_workers=opts.num_workers,
                                               drop_last=False,
                                               shuffle=True)

    val_dataset = get_val_dataset(os.path.join('data', 'val'), opts)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opts.eval_batch_size,
                                             shuffle=False,
                                             num_workers=opts.num_workers,
                                             drop_last=False)

    test_dataset = get_test_dataset(os.path.join('data', 'test'), opts)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=opts.eval_batch_size,
                                              shuffle=False,
                                              num_workers=opts.num_workers,
                                              drop_last=False)

    assert (train_dataset.class_to_idx == val_dataset.class_to_idx ==
            test_dataset.class_to_idx), "Class-to-index mappings differ"

    model = get_model(opts)

    opts.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if torch.cuda.device_count() > 1 and not opts.no_data_parallel:
        model = nn.DataParallel(model)

    model = model.to(opts.device)

    optimizer = optim.RMSprop(model.parameters(),
                              lr=opts.lr,
                              alpha=0.9,
                              weight_decay=1e-5,
                              momentum=0.9)
    scheduler = get_lr_scheduler(optimizer, opts)

    best_val_loss = float('inf')
    best_val_accu = 0.0
    best_val_rec = 0.0
    best_val_prec = 0.0
    best_val_f1 = 0.0
    best_val_auc = 0.0

    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                           Main training loop                           #
    ##########################################################################
    for epoch in range(opts.epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(model, train_loader, optimizer,
                                               opts)

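        # Once the pre-training phase ends, switch to the fine-tuning data
        # and restart the optimizer and scheduler from scratch.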
        if (epoch == opts.finetune_epoch
                and opts.train_mode == 'pretrain_and_finetune'):
            train_loader = finetune_loader
            optimizer = optim.RMSprop(model.parameters(),
                                      lr=opts.lr,
                                      alpha=0.9,
                                      weight_decay=1e-5,
                                      momentum=0.9)
            scheduler = torch.optim.lr_scheduler.StepLR(
                optimizer,
                step_size=opts.step_size_finetuning,
                gamma=opts.gamma)

        # Run the validation set
        with torch.no_grad():
            val_loss, val_metric = evaluate_model(model, val_loader, opts)

        ##############################
        #  Write to summary writer   #
        ##############################

        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
        train_rec, val_rec = train_metric['recalls'], val_metric['recalls']
        train_prec, val_prec = (train_metric['precisions'],
                                val_metric['precisions'])
        train_f1, val_f1 = train_metric['f1'], val_metric['f1']
        train_auc, val_auc = train_metric['auc'], val_metric['auc']

        writer.add_scalar('Loss/Train', train_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_acc, epoch)
        writer.add_scalar('Precision/Train', train_prec, epoch)
        writer.add_scalar('Recall/Train', train_rec, epoch)
        writer.add_scalar('F1/Train', train_f1, epoch)
        writer.add_scalar('AUC/Train', train_auc, epoch)

        writer.add_scalar('Loss/Val', val_loss, epoch)
        writer.add_scalar('Accuracy/Val', val_acc, epoch)
        writer.add_scalar('Precision/Val', val_prec, epoch)
        writer.add_scalar('Recall/Val', val_rec, epoch)
        writer.add_scalar('F1/Val', val_f1, epoch)
        writer.add_scalar('AUC/Val', val_auc, epoch)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if opts.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif opts.lr_scheduler in ['step', 'cosine']:
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        print_epoch_progress(epoch, opts.epochs, train_loss, val_loss, delta,
                             train_metric, val_metric)
        iteration_change_loss += 1
        print('-' * 30)

        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if opts.save_model:
                torch.save(
                    model.state_dict(),
                    os.path.join(model_dir, opts.run_name,
                                 'best_state_dict.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        if val_rec > best_val_rec:
            best_val_rec = val_rec

        if val_prec > best_val_prec:
            best_val_prec = val_prec

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            print(f'The best validation F1-score is now {best_val_f1}')
            print(f'The validation accuracy and AUC are now '
                  f'{val_acc} and {val_auc}')

        if val_auc > best_val_auc:
            best_val_auc = val_auc

        if iteration_change_loss == opts.patience and opts.early_stopping:
            print('Early stopping after {0} epochs without a decrease in '
                  'the validation loss'.format(iteration_change_loss))
            break

    t_end_training = time.time()
    print(f'Training took {t_end_training - t_start_training:.2f}s')
    print(f'Best validation accuracy: {best_val_accu}')
    print(f'Best validation loss: {best_val_loss}')
    print(f'Best validation precision: {best_val_prec}')
    print(f'Best validation recall: {best_val_rec}')
    print(f'Best validation f1: {best_val_f1}')
    print(f'Best validation AUC: {best_val_auc}')

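    # Evaluate on the held-out test set, restoring the best checkpoint for
    # the training modes that saved one.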
    with torch.no_grad():
        if opts.train_mode in ['combined', 'oversampling']:
            model.load_state_dict(
                torch.load(
                    os.path.join(model_dir, opts.run_name,
                                 'best_state_dict.pth')))
        test_loss, test_metric = evaluate_model(model, test_loader, opts)

    print(f'The best test F1: {test_metric["f1"]}')
    print(f'The best test auc: {test_metric["auc"]}')
    print(f'The best test accuracy: {test_metric["accuracy"]}')
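
Example 4 builds its scheduler through a get_lr_scheduler helper that is not shown. Since the loop later branches on 'plateau', 'step', and 'cosine', a sketch along the following lines is consistent with those calls; the hyper-parameters are assumptions borrowed from the sibling examples.

import torch

def get_lr_scheduler(optimizer, opts):
    """Build the scheduler selected by opts.lr_scheduler (illustrative)."""
    if opts.lr_scheduler == 'plateau':
        return torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=opts.patience, factor=0.3, threshold=1e-4)
    if opts.lr_scheduler == 'step':
        return torch.optim.lr_scheduler.StepLR(optimizer, step_size=3,
                                               gamma=opts.gamma)
    if opts.lr_scheduler == 'cosine':
        return torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=opts.epochs)
    raise ValueError('Unknown scheduler: {}'.format(opts.lr_scheduler))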
Example #5
def main(opts):
    """Main function for the training pipeline
    :opts: command-line arguments
    :returns: None
    """
    ##########################################################################
    #                             Basic settings                             #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(os.path.join(model_dir, opts.run_name), exist_ok=True)

    ##########################################################################
    #  Define all the necessary variables for model training and evaluation  #
    ##########################################################################
    writer = SummaryWriter(os.path.join(log_dir, opts.run_name), flush_secs=5)

    train_dataset = get_train_dataset(root=os.path.join('data', 'train'))
    weights = make_weights_for_balanced_classes(train_dataset.imgs,
                                                len(train_dataset.classes))
    weights = torch.DoubleTensor(weights)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        weights, len(weights))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opts.batch_size,
                                               num_workers=6,
                                               drop_last=False,
                                               sampler=sampler)

    val_dataset = get_val_dataset(root=os.path.join('data', 'val'))
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opts.batch_size,
                                             shuffle=False,
                                             num_workers=6,
                                             drop_last=False)

    assert train_dataset.class_to_idx == val_dataset.class_to_idx, \
        "Class-to-index mappings differ"

    model = load_baseline(n_classes=2)

    if torch.cuda.is_available():
        model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=opts.lr, weight_decay=0.1)

    if opts.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               patience=3,
                                                               factor=.3,
                                                               threshold=1e-4,
                                                               verbose=True)
    elif opts.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=opts.gamma)

    best_val_loss = float('inf')
    best_val_accu = 0.0

    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                           Main training loop                           #
    ##########################################################################
    for epoch in range(opts.epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(model, train_loader, epoch,
                                               opts.epochs, optimizer, writer,
                                               current_lr, opts.log_every)

        with torch.no_grad():
            val_loss, val_metric = evaluate_model(model, val_loader, epoch,
                                                  opts.epochs, writer,
                                                  current_lr)

        ##############################
        #  Write to summary writer   #
        ##############################

        writer.add_scalar('Loss/Train', train_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_metric['accuracy'], epoch)
        writer.add_scalar('Precision/Train', train_metric['precisions'], epoch)
        writer.add_scalar('Recall/Train', train_metric['recalls'], epoch)
        writer.add_scalar('F1/Train', train_metric['f1'], epoch)

        writer.add_scalar('Loss/Val', val_loss, epoch)
        writer.add_scalar('Accuracy/Val', val_metric['accuracy'], epoch)
        writer.add_scalar('Precision/Val', val_metric['precisions'], epoch)
        writer.add_scalar('Recall/Val', val_metric['recalls'], epoch)
        writer.add_scalar('F1/Val', val_metric['f1'], epoch)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if opts.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif opts.lr_scheduler == 'step':
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        print_epoch_progress(train_loss, val_loss, delta, train_metric,
                             val_metric)
        iteration_change_loss += 1
        print('-' * 30)

        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
    # file_name = ('train_acc_{}_val_acc_{}_epoch_{}.pth'
    #              .format(train_acc, val_acc, epoch))
    # torch.save(model, os.path.join(model_dir, opts.run_name, file_name))

        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if opts.save_model:
                torch.save(model,
                           os.path.join(model_dir, opts.run_name, 'best.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        if iteration_change_loss == opts.patience and opts.early_stopping:
            print('Early stopping after {0} epochs without a decrease in '
                  'the validation loss'.format(iteration_change_loss))
            break
    t_end_training = time.time()
    print('Training took {:.2f}s'.format(t_end_training - t_start_training))
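
Example 5's make_weights_for_balanced_classes is also not shown. It takes ImageFolder-style (path, class_index) pairs plus the number of classes, and the usual approach weighs each sample by the inverse frequency of its class; the body below is that common pattern, not necessarily the original implementation.

def make_weights_for_balanced_classes(images, nclasses):
    """Per-sample weights that equalize class frequencies.

    `images` is a list of (path, class_index) pairs, as in
    torchvision's ImageFolder.imgs attribute.
    """
    count = [0] * nclasses
    for _, label in images:
        count[label] += 1
    # Each class gets total weight len(images)/nclasses spread over its samples
    weight_per_class = [len(images) / float(c) for c in count]
    return [weight_per_class[label] for _, label in images]

The returned list feeds WeightedRandomSampler(weights, len(weights)) exactly as in the example, so each class is drawn with roughly equal probability per epoch.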