Example #1
# Shared imports assumed by all three examples; the module paths for the
# project-local helpers (BasicDataset, eval_net, dice_loss,
# convert_result_to_csv) are guesses:
import os
import logging

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from utils.dataset import BasicDataset
from utils.metrics import dice_loss, convert_result_to_csv
from eval import eval_net

def train_net(net,
              device,
              epochs=250,
              batch_size=4,
              lr=0.0001,
              save_cp=True,
              args=None,
              input_path=None,
              test=False):
    # NOTE: the `test` flag above is currently unused; this variant only
    # restores a checkpoint and evaluates, so the flag presumably belongs here

    dir_img = input_path.dir_img
    dir_mask = input_path.dir_mask
    dir_valimg = input_path.dir_valimg
    dir_valmask = input_path.dir_valmask
    dir_testimg = input_path.dir_testimg
    dir_testmask = input_path.dir_testmask
    dir_externaltestimg = input_path.dir_externaltestimg
    dir_externaltestmask = input_path.dir_externaltestmask

    exp_name = args.expname
    img_scale = args.scale
    color_map = args.colormap

    dir_checkpoint = os.path.join(input_path.dir_checkpoint, exp_name)

    dataset = BasicDataset(dir_img, dir_mask, img_scale, color_map, 'train')
    dataval = BasicDataset(dir_valimg, dir_valmask, img_scale, color_map,
                           'val')
    datatest = BasicDataset(dir_testimg, dir_testmask, img_scale, color_map,
                            'test')
    dataexternaltest = BasicDataset(dir_externaltestimg, dir_externaltestmask,
                                    img_scale, color_map, 'test')

    n_val = len(dataval)
    n_train = len(dataset)

    train_loader = DataLoader(dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True,
                              drop_last=False)
    val_loader = DataLoader(dataval,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=4,
                            pin_memory=True,
                            drop_last=False)
    test_loader = DataLoader(datatest,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=4,
                             pin_memory=True,
                             drop_last=False)
    external_test_loader = DataLoader(dataexternaltest,
                                      batch_size=batch_size,
                                      shuffle=False,
                                      num_workers=4,
                                      pin_memory=True,
                                      drop_last=False)

    writer = SummaryWriter(comment=f'_EXPNAME_{exp_name}')
    global_step = 0

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Learning rate:   {lr}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images scaling:  {img_scale}
        Color map:       {color_map}
    ''')

    optimizer = optim.Adam(net.parameters(), lr=lr, betas=(0.9, 0.999))
    criterion = nn.CrossEntropyLoss()

    # evaluation only: restore a hard-coded checkpoint before scoring the test set
    net.load_state_dict(
        torch.load("checkpoints/UNet_exp0CP_epoch10.pth", map_location=device))
    net.eval()
    test_score = eval_net(net, test_loader, device, "Output/", True)

    print(test_score)
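
Example #1 never constructs `args` or `input_path`; both are plain attribute
containers. A minimal usage sketch, assuming argparse-style namespaces and the
usual `UNet(n_channels, n_classes)` constructor (every path, module path, and
option value below is an illustrative assumption):

from types import SimpleNamespace
import torch
from unet import UNet  # assumed module path

input_path = SimpleNamespace(
    dir_img='data/train/imgs', dir_mask='data/train/masks',
    dir_valimg='data/val/imgs', dir_valmask='data/val/masks',
    dir_testimg='data/test/imgs', dir_testmask='data/test/masks',
    dir_externaltestimg='data/ext/imgs', dir_externaltestmask='data/ext/masks',
    dir_checkpoint='checkpoints')
args = SimpleNamespace(expname='exp0', scale=1.0, colormap='gray')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = UNet(n_channels=3, n_classes=2).to(device)
train_net(net, device, args=args, input_path=input_path)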
Example #2
def train_net(net,
              device,
              epochs=250,
              batch_size=4,
              lr=0.0001,
              save_cp=True,
              args=None,
              input_path=None):

    # assign image paths
    dir_img = input_path.dir_img
    dir_mask = input_path.dir_mask
    dir_valimg = input_path.dir_valimg
    dir_valmask = input_path.dir_valmask
    dir_testimg = input_path.dir_testimg
    dir_testmask = input_path.dir_testmask
    dir_externaltestimg = input_path.dir_externaltestimg
    dir_externaltestmask = input_path.dir_externaltestmask

    # assign experiment options
    exp_name = args.expname
    img_scale = args.scale
    color_map = args.colormap

    dir_checkpoint = os.path.join(input_path.dir_checkpoint, exp_name)

    dataset = BasicDataset(dir_img, dir_mask, img_scale, color_map, 'train')
    dataval = BasicDataset(dir_valimg, dir_valmask, img_scale, color_map,
                           'val')
    datatest = BasicDataset(dir_testimg, dir_testmask, img_scale, color_map,
                            'test')
    dataexternaltest = BasicDataset(dir_externaltestimg, dir_externaltestmask,
                                    img_scale, color_map, 'test')

    # yuankai: derive the train/val sizes automatically from the datasets
    n_val = len(dataval)
    n_train = len(dataset)

    train_loader = DataLoader(dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)
    val_loader = DataLoader(dataval,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=4,
                            pin_memory=True,
                            drop_last=True)
    test_loader = DataLoader(datatest,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=4,
                             pin_memory=True,
                             drop_last=True)
    external_test_loader = DataLoader(dataexternaltest,
                                      batch_size=batch_size,
                                      shuffle=False,
                                      num_workers=4,
                                      pin_memory=True,
                                      drop_last=True)

    writer = SummaryWriter(comment=f'_EXPNAME_{exp_name}')
    global_step = 0

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Learning rate:   {lr}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images scaling:  {img_scale}
        Color map:       {color_map}
    ''')

    # yuankai: changed the optimizer from RMSprop to Adam
    # optimizer = optim.RMSprop(net.parameters(), lr=lr, weight_decay=1e-8, momentum=0.9)
    optimizer = optim.Adam(net.parameters(), lr=lr, betas=(0.9, 0.999))
    # yuankai: removed the LR scheduler
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min' if net.n_classes > 1 else 'max', patience=2)
    if net.n_classes > 1:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        net.train()

        epoch_loss = 0
        with tqdm(total=n_train,
                  desc=f'Epoch {epoch + 1}/{epochs}',
                  unit='img') as pbar:
            for batch in train_loader:
                imgs = batch['image']
                true_masks = batch['mask']
                # yuankai: build a two-channel (background, foreground) mask for the dice loss
                true_masks_2channel = true_masks.unsqueeze(1)
                true_masks_2channel = torch.cat(
                    (1 - true_masks_2channel, true_masks_2channel), 1)
                true_masks_2channel = true_masks_2channel.to(
                    device=device, dtype=torch.float32)

                assert imgs.shape[1] == net.n_channels, \
                    f'Network has been defined with {net.n_channels} input channels, ' \
                    f'but loaded images have {imgs.shape[1]} channels. Please check that ' \
                    'the images are loaded correctly.'

                imgs = imgs.to(device=device, dtype=torch.float32)

                if net.n_classes == 1:
                    mask_type = torch.float32
                else:
                    mask_type = torch.long

                true_masks = true_masks.to(device=device, dtype=mask_type)
                masks_pred = net(imgs)
                loss_cross = criterion(masks_pred, true_masks)
                # yuankai: add the dice loss; dice_loss is assumed to return the
                # negative soft Dice coefficient, so 1 + dice_loss equals 1 - Dice
                loss_dice = 1 + dice_loss(masks_pred, true_masks_2channel)
                # yuankai: sum the two losses as the final loss
                loss = loss_dice + loss_cross

                epoch_loss += loss.item()
                writer.add_scalar('Loss/train', loss.item(), global_step)

                pbar.set_postfix(
                    **{
                        'loss (batch)': loss.item(),
                        'loss_cr (batch)': loss_cross.item(),
                        'loss_dsc (batch)': loss_dice.item()
                    })
                # pbar.set_postfix(**{'loss2 (batch)': loss2.item()})

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_value_(net.parameters(), 0.1)
                optimizer.step()

                pbar.update(imgs.shape[0])
                # bug fix: increment global_step so TensorBoard scalars are not all logged at step 0
                global_step += 1

        for tag, value in net.named_parameters():
            tag = tag.replace('.', '/')
            writer.add_histogram('weights/' + tag,
                                 value.data.cpu().numpy(), global_step)
            writer.add_histogram('grads/' + tag,
                                 value.grad.data.cpu().numpy(), global_step)
        val_score = eval_net(net, val_loader, device)
        test_score = eval_net(net, test_loader, device)
        external_test_score = eval_net(net, external_test_loader, device)
        # scheduler.step(val_score)

        writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'],
                          global_step)

        logging.info('Finished epoch %d/%d' % (epoch + 1, epochs))
        logging.info('Validation Dice Coeff: {}'.format(val_score))
        writer.add_scalar('Dice/val', val_score, global_step)
        logging.info('Internal Testing Dice Coeff: {}'.format(test_score))
        writer.add_scalar('Dice/test', test_score, global_step)
        logging.info(
            'External Testing Dice Coeff: {}'.format(external_test_score))
        writer.add_scalar('Dice/external_test', external_test_score,
                          global_step)

        writer.add_images('images', imgs, global_step)
        if net.n_classes == 1:
            writer.add_images('masks/true', true_masks, global_step)
            writer.add_images('masks/pred',
                              torch.sigmoid(masks_pred) > 0.5, global_step)
        else:
            writer.add_images('masks/true', true_masks.unsqueeze(1),
                              global_step)
            writer.add_images('masks/pred',
                              masks_pred.max(dim=1)[1].unsqueeze(1),
                              global_step)
            # writer.add_images('masks/pred', torch.sigmoid(masks_pred) > 0.5, global_step)

        if not os.path.exists(dir_checkpoint):
            os.makedirs(dir_checkpoint)

        csv_file_name = os.path.join(dir_checkpoint,
                                     '%s_result_log.csv' % exp_name)
        convert_result_to_csv(
            [epoch, val_score, test_score, external_test_score], csv_file_name)

        if save_cp and (epoch + 1) % 5 == 0:
            # the checkpoint directory was already created above with os.makedirs;
            # use os.path.join so the filename gets a path separator
            torch.save(net.state_dict(),
                       os.path.join(dir_checkpoint, f'CP_epoch{epoch + 1}.pth'))
            logging.info(f'Checkpoint {epoch + 1} saved!')

    writer.close()
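
`dice_loss` is used by Examples #2 and #3 but never defined here. For
`1 + dice_loss(masks_pred, true_masks_2channel)` to equal the usual 1 - Dice
loss, the helper would have to return the *negative* soft Dice coefficient;
a minimal sketch under that assumption:

import torch
import torch.nn.functional as F

def dice_loss(logits, target, eps=1e-6):
    """Return the negative soft Dice coefficient.

    logits: (N, C, H, W) raw network outputs.
    target: (N, C, H, W) one-hot float masks.
    """
    probs = F.softmax(logits, dim=1)                       # per-pixel class probabilities
    intersection = (probs * target).sum(dim=(2, 3))        # (N, C)
    union = probs.sum(dim=(2, 3)) + target.sum(dim=(2, 3))
    dice = (2 * intersection + eps) / (union + eps)        # soft Dice per sample and class
    return -dice.mean()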
Example #3
def train_net(net,
              device,
              epochs=250,
              batch_size=4,
              lr=0.0001,
              val_percent=0.2,
              save_cp=True,
              img_scale=1):

    # dir_img, dir_mask and the validation/checkpoint paths are module-level globals here
    dataset = BasicDataset(dir_img, dir_mask, False, img_scale)

    # pre-processed data: (flip, noise, augmentation, etc):
    datasetAug = BasicDataset(dir_img, dir_mask, True, img_scale)

    dataval = BasicDataset(dir_valimg, dir_valmask, False, img_scale)
    # pre-processed data: (flip, noise, augmentation, etc):
    datavalAug = BasicDataset(dir_valimg, dir_valmask, True, img_scale)

    # increasing sample size with augmented data.
    dataset = dataset + datasetAug
    dataval = dataval + datavalAug

    # yuankai: derive the train/val sizes automatically from the datasets
    n_val = len(dataval)
    n_train = len(dataset)

    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
    val_loader = DataLoader(dataval, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True, drop_last=True)

    writer = SummaryWriter(comment=f'LR_{lr}_BS_{batch_size}_SCALE_{img_scale}')
    global_step = 0

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Learning rate:   {lr}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images scaling:  {img_scale}
    ''')

    # yuankai: changed the optimizer from RMSprop to Adam
    # optimizer = optim.RMSprop(net.parameters(), lr=lr, weight_decay=1e-8, momentum=0.9)
    optimizer = optim.Adam(net.parameters(), lr=lr, betas=(0.9, 0.999))
    # yuankai: removed the LR scheduler
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min' if net.n_classes > 1 else 'max', patience=2)
    if net.n_classes > 1:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        net.train()

        epoch_loss = 0
        with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='img') as pbar:
            for batch in train_loader:
                imgs = batch['image']
                true_masks = batch['mask']

                # yuankai: build a two-channel (background, foreground) mask for the dice loss
                true_masks_2channel = true_masks.unsqueeze(1)
                # bug fix: use 1 - mask as in Example #2; bitwise ~ is only correct for bool tensors
                true_masks_2channel = torch.cat((1 - true_masks_2channel, true_masks_2channel), 1)
                true_masks_2channel = true_masks_2channel.to(device=device, dtype=torch.float32)

                assert imgs.shape[1] == net.n_channels, \
                    f'Network has been defined with {net.n_channels} input channels, ' \
                    f'but loaded images have {imgs.shape[1]} channels. Please check that ' \
                    'the images are loaded correctly.'

                imgs = imgs.to(device=device, dtype=torch.float32)

                if net.n_classes == 1:
                    mask_type = torch.float32
                else:
                    mask_type = torch.long

                true_masks = true_masks.to(device=device, dtype=mask_type)
                masks_pred = net(imgs)
                loss_cross = criterion(masks_pred, true_masks)
                # yuankai: add the dice loss (see the dice_loss sketch after Example #2)
                loss_dice = 1 + dice_loss(masks_pred, true_masks_2channel)
                # yuankai: sum the two losses as the final loss
                loss = loss_dice + loss_cross

                epoch_loss += loss.item()
                writer.add_scalar('Loss/train', loss.item(), global_step)

                pbar.set_postfix(**{'loss (batch)': loss.item(), 'loss_cr (batch)': loss_cross.item(), 'loss_dsc (batch)': loss_dice.item()})
                # pbar.set_postfix(**{'loss2 (batch)': loss2.item()})

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_value_(net.parameters(), 0.1)
                optimizer.step()

                pbar.update(imgs.shape[0])

                global_step += 1
                # evaluate roughly 10 times per epoch; max(1, ...) guards against a zero divisor on small datasets
                if global_step % max(1, len(dataset) // (10 * batch_size)) == 0:
                    for tag, value in net.named_parameters():
                        tag = tag.replace('.', '/')
                        writer.add_histogram('weights/' + tag, value.data.cpu().numpy(), global_step)
                        writer.add_histogram('grads/' + tag, value.grad.data.cpu().numpy(), global_step)
                    val_score = eval_net(net, val_loader, device)
                    # scheduler.step(val_score)

                    writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], global_step)

                    # val_score is a Dice coefficient either way, so log a single tag
                    logging.info('Validation Dice Coeff: {}'.format(val_score))
                    writer.add_scalar('Dice/val', val_score, global_step)

                    writer.add_images('images', imgs, global_step)
                    if net.n_classes == 1:
                        writer.add_images('masks/true', true_masks, global_step)
                        writer.add_images('masks/pred', torch.sigmoid(masks_pred) > 0.5, global_step)
                    else:
                        writer.add_images('masks/true', true_masks.unsqueeze(1), global_step)
                        writer.add_images('masks/pred', masks_pred.max(dim=1)[1].unsqueeze(1), global_step)
                        # writer.add_images('masks/pred', torch.sigmoid(masks_pred) > 0.5, global_step)

        if save_cp and (epoch + 1) % 10 == 0:
            try:
                os.mkdir(dir_checkpoint)
                logging.info('Created checkpoint directory')
            except OSError:
                pass
            # use os.path.join so the filename gets a path separator
            torch.save(net.state_dict(),
                       os.path.join(dir_checkpoint, f'CP_epoch{epoch + 1}.pth'))
            logging.info(f'Checkpoint {epoch + 1} saved!')

    writer.close()
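
Example #3 doubles its sample count with `dataset + datasetAug`. That works
because `torch.utils.data.Dataset.__add__` returns a `ConcatDataset`; a small
self-contained illustration:

from torch.utils.data import Dataset, ConcatDataset

class ToyDataset(Dataset):
    def __init__(self, items):
        self.items = items

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.items[idx]

plain = ToyDataset([1, 2, 3])
augmented = ToyDataset([10, 20, 30])
combined = plain + augmented  # same as ConcatDataset([plain, augmented])
assert isinstance(combined, ConcatDataset)
assert len(combined) == 6     # both datasets are iterated back to back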