Example #1

# Third-party imports; config, log, util, architecture, transforms, COVIDxFolder
# and validate are project-local modules from the example's repository.
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader


def main():
    if config.gpu and not torch.cuda.is_available():
        raise ValueError("GPU not supported or enabled on this system.")
    use_gpu = config.gpu

    log.info("Loading train dataset")
    train_dataset = COVIDxFolder(
        config.train_imgs, config.train_labels,
        transforms.train_transforms(config.width, config.height))
    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              shuffle=True,
                              drop_last=True,
                              num_workers=config.n_threads,
                              pin_memory=use_gpu)
    log.info("Number of training examples {}".format(len(train_dataset)))

    log.info("Loading val dataset")
    val_dataset = COVIDxFolder(
        config.val_imgs, config.val_labels,
        transforms.val_transforms(config.width, config.height))
    val_loader = DataLoader(val_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            num_workers=config.n_threads,
                            pin_memory=use_gpu)
    log.info("Number of validation examples {}".format(len(val_dataset)))

    if config.weights:
        state = torch.load(config.weights)
        log.info("Loaded model weights from: {}".format(config.weights))
    else:
        state = None

    state_dict = state["state_dict"] if state else None
    model = architecture.COVIDNext50(n_classes=config.n_classes)
    if state_dict:
        model = util.load_model_weights(model=model, state_dict=state_dict)

    if use_gpu:
        model.cuda()
        model = torch.nn.DataParallel(model)
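        # The DataParallel wrapper means the underlying network is reached via
        # model.module further down (e.g. model.module.probability in the
        # logging block); running purely on CPU would require dropping .module.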
    optim_layers = filter(lambda p: p.requires_grad, model.parameters())

    # optimizer and lr scheduler
    optimizer = Adam(optim_layers,
                     lr=config.lr,
                     weight_decay=config.weight_decay)
    scheduler = ReduceLROnPlateau(optimizer=optimizer,
                                  factor=config.lr_reduce_factor,
                                  patience=config.lr_reduce_patience,
                                  mode='max',
                                  min_lr=1e-7)

    # Load the last global_step from the checkpoint if existing
    global_step = 0 if state is None else state['global_step'] + 1

    class_weights = util.to_device(torch.FloatTensor(config.loss_weights),
                                   gpu=use_gpu)
    loss_fn = CrossEntropyLoss(weight=class_weights)

    # Reset the best metric score
    best_score = -1
    for epoch in range(config.epochs):
        log.info("Started epoch {}/{}".format(epoch + 1, config.epochs))
        for data in train_loader:
            imgs, labels = data
            imgs = util.to_device(imgs, gpu=use_gpu)
            labels = util.to_device(labels, gpu=use_gpu)

            logits = model(imgs)
            loss = loss_fn(logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if global_step % config.log_steps == 0 and global_step > 0:
                probs = model.module.probability(logits)
                preds = torch.argmax(probs, dim=1).detach().cpu().numpy()
                labels = labels.cpu().detach().numpy()
                acc, f1, _, _ = util.clf_metrics(preds, labels)
                lr = util.get_learning_rate(optimizer)

                log.info("Step {} | TRAINING batch: Loss {:.4f} | F1 {:.4f} | "
                         "Accuracy {:.4f} | LR {:.2e}".format(
                             global_step, loss.item(), f1, acc, lr))

            if global_step % config.eval_steps == 0 and global_step > 0:
                best_score = validate(val_loader,
                                      model,
                                      best_score=best_score,
                                      global_step=global_step,
                                      cfg=config)
                scheduler.step(best_score)
            global_step += 1
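
Example #1 relies on a project-local validate() helper that is not shown above.
A minimal sketch of what such a helper could look like, assuming it mirrors the
metric code used in the training loop and returns the best F1 seen so far (the
body below is an illustration under those assumptions, not the original
implementation; the real helper presumably also saves a checkpoint when the
score improves):

def validate(data_loader, model, best_score, global_step, cfg):
    """Hypothetical validation pass: compute F1 over the validation set and
    return the best score observed so far (sketch, not the original code)."""
    model.eval()
    preds, gts = [], []
    with torch.no_grad():
        for imgs, labels in data_loader:
            imgs = util.to_device(imgs, gpu=cfg.gpu)
            logits = model(imgs)
            preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
            gts.extend(labels.tolist())
    acc, f1, _, _ = util.clf_metrics(preds, gts)
    log.info("VALIDATION | Step {} | F1 {:.4f} | Accuracy {:.4f}".format(
        global_step, f1, acc))
    model.train()
    return max(best_score, f1)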

Example #2

# Note: this baseline trainer targets the legacy (pre-0.4) PyTorch API, where
# Variable, loss.data[0] and F.sigmoid were still current. Helpers such as
# get_dataloader, get_optimizer, lr_schedule, get_learning_rate,
# multi_criterion, multi_f_measure, evaluate and Logger, as well as the
# `models` and `batch_size` lists, come from the example's own repository.
def train_baselines():

    train_data, val_data = get_dataloader(96)

    for model, batch in zip(models, batch_size):
        name = str(model).split()[1]
        print('*****Start Training {} with batch size {}******'.format(
            name, batch))
        print(
            ' epoch   iter   rate  |  smooth_loss   |  train_loss  (acc)  |  valid_loss  (acc)  | total_train_loss\n'
        )
        logger = Logger('../log/{}'.format(name), name)

        net = model(pretrained=True)
        optimizer = get_optimizer(net,
                                  lr=.01,
                                  pretrained=True,
                                  resnet=True if 'resnet' in name else False)
        net = nn.DataParallel(net.cuda())

        train_data.batch_size = batch
        val_data.batch_size = batch

        num_epoches = 50  #100
        print_every_iter = 20
        epoch_test = 1

        # optimizer
        # optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0005)
        # optimizer = optim.Adam(net.parameters(), lr=1e-4, weight_decay=5e-4)

        smooth_loss = 0.0
        train_loss = np.nan
        train_acc = np.nan
        # test_loss = np.nan
        best_test_loss = np.inf
        # test_acc = np.nan
        t = time.time()

        for epoch in range(
                num_epoches):  # loop over the dataset multiple times

            # train loss averaged every epoch
            total_epoch_loss = 0.0

            lr_schedule(epoch, optimizer, pretrained=True)

            rate = get_learning_rate(optimizer)[0]  # check

            sum_smooth_loss = 0.0
            total_sum = 0
            sum = 0
            net.cuda().train()

            num_its = len(train_data)
            for it, (images, labels, indices) in enumerate(train_data, 0):

                logits = net(Variable(images.cuda()))
                probs = F.sigmoid(logits)
                loss = multi_criterion(logits, labels.cuda())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # additional metrics
                sum_smooth_loss += loss.data[0]
                total_epoch_loss += loss.data[0]
                sum += 1
                total_sum += 1

                # print statistics
                if it % print_every_iter == print_every_iter - 1:
                    smooth_loss = sum_smooth_loss / sum
                    sum_smooth_loss = 0.0
                    sum = 0

                    train_acc = multi_f_measure(probs.data, labels.cuda())
                    train_loss = loss.data[0]
                    print('\r{}   {}    {}   |  {}  | {}  {} | ... '.format(
                        epoch + it / num_its, it + 1, rate, smooth_loss,
                        train_loss, train_acc),
                          end='',
                          flush=True)

            total_epoch_loss = total_epoch_loss / total_sum
            if epoch % epoch_test == epoch_test - 1 or epoch == num_epoches - 1:
                net.cuda().eval()
                test_loss, test_acc = evaluate(net, val_data)
                print('\r', end='', flush=True)
                print('{}   {}    {}   |  {}  | {}  {} | {}  {} | {}'.format(
                    epoch + 1, it + 1, rate, smooth_loss, train_loss,
                    train_acc, test_loss, test_acc, total_epoch_loss))

                # save if the current loss is better
                if test_loss < best_test_loss:
                    print('save {} {}'.format(test_loss, best_test_loss))
                    torch.save(net.state_dict(),
                               '../models/{}.pth'.format(name))
                    net.load_state_dict(
                        torch.load('../models/{}.pth'.format(name)))
                    print(evaluate(net, val_data))

                    best_test_loss = test_loss

            logger.add_record('train_loss', total_epoch_loss)
            logger.add_record('evaluation_loss', test_loss)
            logger.add_record('f2_score', test_acc)

            logger.save()
            logger.save_plot()
            logger.save_time(start_time=t, end_time=time.time())
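
Both baseline trainers call multi_criterion and multi_f_measure, which are
defined elsewhere in their repository. For a multi-label task such as the
Planet Amazon competition these are typically a multi-label binary
cross-entropy and an F-beta score with beta=2. A rough sketch of that plausible
behaviour, written against the current PyTorch API for readability (names,
threshold and beta are assumptions, not the original code):

def multi_criterion(logits, labels):
    # Assumed behaviour: per-class binary cross-entropy on the logits.
    return F.binary_cross_entropy_with_logits(logits, labels.float())

def multi_f_measure(probs, labels, threshold=0.5, beta=2, eps=1e-12):
    # Assumed behaviour: batch-averaged F-beta score (beta=2 favours recall,
    # matching the competition's F2 metric).
    preds = (probs > threshold).float()
    labels = labels.float()
    tp = (preds * labels).sum(dim=1)
    precision = tp / (preds.sum(dim=1) + eps)
    recall = tp / (labels.sum(dim=1) + eps)
    f_beta = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall + eps)
    return f_beta.mean().item()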

Example #3

# A variant of the previous baseline trainer: it uses the same legacy PyTorch
# API and helpers, but switches to new_lr_schedule, trains for 60 epochs and
# writes logs and model weights to absolute paths.
def train_baselines():
    train_data, val_data = get_dataloader(96)

    for model, batch in zip(models, batch_size):
        name = str(model).split()[1]
        print('*****Start Training {} with batch size {}******'.format(
            name, batch))
        print(
            ' epoch   iter   rate  |  smooth_loss   |  train_loss  (acc)  |  valid_loss  (acc)  | total_train_loss\n'
        )
        logger = Logger(
            '/mnt/home/dunan/Learn/Kaggle/planet_amazon/log/full_data_{}_10xlr_2'
            .format(name), name)

        # load pre-trained model on train-37479
        net = model(pretrained=True)
        net = nn.DataParallel(net.cuda())
        # load_net(net, name)
        # optimizer = get_optimizer(net.module, lr=.005, pretrained=True, resnet=True if 'resnet' in name else False)
        optimizer = get_optimizer(net.module,
                                  lr=.01,
                                  pretrained=True,
                                  resnet=True if 'resnet' in name else False)
        train_data.batch_size = batch
        val_data.batch_size = batch

        num_epoches = 60
        print_every_iter = 20
        epoch_test = 1

        smooth_loss = 0.0
        train_loss = np.nan
        train_acc = np.nan
        best_test_loss = np.inf
        t = time.time()

        for epoch in range(
                num_epoches):  # loop over the dataset multiple times

            # train loss averaged every epoch
            total_epoch_loss = 0.0

            # lr_schedule(epoch, optimizer, base_lr=0.005, pretrained=True)
            new_lr_schedule(epoch, optimizer)

            rate = get_learning_rate(optimizer)[0]  # check

            sum_smooth_loss = 0.0
            total_sum = 0
            sum = 0
            net.cuda().train()

            num_its = len(train_data)
            for it, (images, labels, indices) in enumerate(train_data, 0):

                logits = net(Variable(images.cuda()))
                probs = F.sigmoid(logits)
                loss = multi_criterion(logits, labels.cuda())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # additional metrics
                sum_smooth_loss += loss.data[0]
                total_epoch_loss += loss.data[0]
                sum += 1
                total_sum += 1

                # print statistics
                if it % print_every_iter == print_every_iter - 1:
                    smooth_loss = sum_smooth_loss / sum
                    sum_smooth_loss = 0.0
                    sum = 0

                    train_acc = multi_f_measure(probs.data, labels.cuda())
                    train_loss = loss.data[0]
                    print('\r{}   {}    {}   |  {}  | {}  {} | ... '.format(
                        epoch + it / num_its, it + 1, rate, smooth_loss,
                        train_loss, train_acc),
                          end='',
                          flush=True)

            total_epoch_loss = total_epoch_loss / total_sum
            if epoch % epoch_test == epoch_test - 1 or epoch == num_epoches - 1:
                net.cuda().eval()
                test_loss, test_acc = evaluate(net, val_data)
                print('\r', end='', flush=True)
                print('{}   {}    {}   |  {}  | {}  {} | {}  {} | {}'.format(
                    epoch + 1, it + 1, rate, smooth_loss, train_loss,
                    train_acc, test_loss, test_acc, total_epoch_loss))

                # save if the current loss is better
                if test_loss < best_test_loss:
                    print('save {} {}'.format(test_loss, best_test_loss))
                    torch.save(
                        net.state_dict(),
                        '/mnt/home/dunan/Learn/Kaggle/planet_amazon/model/full_data_{}_10xlr_2.pth'
                        .format(name))
                    best_test_loss = test_loss

            logger.add_record('train_loss', total_epoch_loss)
            logger.add_record('evaluation_loss', test_loss)
            logger.add_record('f2_score', test_acc)

            logger.save()
            logger.save_plot()
            logger.save_time(start_time=t, end_time=time.time())
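
Both variants adjust the learning rate once per epoch through the lr_schedule /
new_lr_schedule helpers, which are not shown. A typical step-decay
implementation of this kind might look as follows (the epoch boundaries and
decay factors are assumptions, not the schedule actually used):

def new_lr_schedule(epoch, optimizer, base_lr=0.01):
    # Hypothetical step decay: cut the learning rate by 10x at fixed epochs.
    if epoch < 20:
        lr = base_lr
    elif epoch < 40:
        lr = base_lr * 0.1
    else:
        lr = base_lr * 0.01
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr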

Example #4

# Third-party imports; util, fcpn, data and the helpers setup_queue,
# create_new_session and start_data_loader are project-local modules from the
# example's repository.
import os

import numpy as np
import tensorflow as tf


def train(config_path):
    """ Trains a model for a maximum of config.max_epochs epochs

    Args:
    config_path: string, path to a config.json file

    """

    # Load configuration
    if not os.path.exists(config_path):
        print('Error: No configuration file present at specified path.')
        return

    config = util.load_config(config_path)
    print('Loaded configuration from: %s' % config_path)

    # Create a new session directory if none is configured or the configured one does not exist yet
    if 'session_dir' not in config['training'] or not os.path.exists(config['training']['session_dir']):
        create_new_session(config)

    # Direct all output to screen and log file
    util.set_print_to_screen_and_file(
        os.path.join(config['training']['session_dir'], 'session.log'))

    model = fcpn.FCPN(config)
    dataset = data.Dataset(config)
    dataset.prepare(config['dataset']['refresh_cache'])

    config['model']['pointnet']['num'] = np.prod(model.get_feature_volume_shape(
        config['dataset']['training_samples']['spatial_size'], config['model']['pointnet']['spacing'], 1))

    enqueue_op, queue_placeholders, queue_batch_placeholders, get_queue_size_op = setup_queue(
        config['dataset']['training_samples']['num_points'] + config['model']['pointnet']['num'], dataset.get_num_output_voxels(), config['training']['batch_size'])

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = config['training']['gpu']['allow_growth']
    tf_config.allow_soft_placement = config['training']['gpu']['allow_soft_placement']

    sess = tf.Session(config=tf_config)

    with sess.as_default():
        with tf.device('/gpu:' + str(config['training']['gpu']['id'])):

            # Batch normalization
            batch_i = tf.Variable(0, name='batch_i')
            batch_normalization_decay = util.get_batch_normalization_decay(
                batch_i, config['training']['batch_size'], config['training']['optimizer']['batch_normalization']['initial_decay'], config['training']['optimizer']['batch_normalization']['decay_rate'], config['training']['optimizer']['batch_normalization']['decay_step'])
            tf.summary.scalar('batch_normalization_decay',
                              batch_normalization_decay)
            is_training_pl = tf.placeholder(tf.bool, shape=())
            
            # Build model
            pred_op = model.build_model(config['training']['batch_size'], config['dataset']['training_samples']['spatial_size'], queue_batch_placeholders['input_points_pl'],
                                        queue_batch_placeholders['input_features_pl'], is_training_pl, dataset.get_num_learnable_classes(), batch_normalization_decay)
            
            # Loss
            loss_op = model.get_loss(
                pred_op, queue_batch_placeholders['output_voxels_pl'], queue_batch_placeholders['output_voxel_weights_pl'])

            model.print_num_parameters()
            model.print_layer_weights()

            # Confusion matrix
            confusion_matrix_op, confusion_matrix_update_op, confusion_matrix_clear_op = model.get_confusion_matrix_ops(
                pred_op, queue_batch_placeholders['output_voxels_pl'], dataset.get_num_learnable_classes(), dataset.get_empty_class())

            # Optimizer
            learning_rate_op = util.get_learning_rate(
                batch_i, config['training']['batch_size'], config['training']['optimizer']['learning_rate']['initial'], config['training']['optimizer']['learning_rate']['decay_rate'], config['training']['optimizer']['learning_rate']['decay_step'])
            tf.summary.scalar('learning_rate', learning_rate_op)

            optimizer_op = tf.train.AdamOptimizer(learning_rate_op)
            if config['training']['train_upsampling_only']:
                upsampling_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "upsampling")
                optimization_op = optimizer_op.minimize(loss_op, var_list=upsampling_weights, global_step=batch_i)
            else:
                optimization_op = optimizer_op.minimize(loss_op, global_step=batch_i)

            # Summary and Saving
            saver = tf.train.Saver(max_to_keep=config['training']['checkpoints_to_keep'])
            merged_summary_op = tf.summary.merge_all()
            summary_writers = {
                'train': tf.summary.FileWriter(os.path.join(config['training']['session_dir'], 'train'), sess.graph),
                'val': tf.summary.FileWriter(os.path.join(config['training']['session_dir'], 'val'))
            }

            # Initialize variables in graph
            init_g = tf.global_variables_initializer()
            init_l = tf.local_variables_initializer()
            sess.run([init_g, init_l], {is_training_pl: True})

            # Restore model weights from disk
            if config['training']['checkpoint_path']:

                weights_to_be_restored = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

                # If finetuning on a new dataset, don't load last layer weights or confusion matrix
                if config['training']['finetune_new_classes']:
                    final_layer_weights =  tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="upsampling/15cm_to_5cm/final_conv")
                    confusion_variables =  tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="confusion")
                    weights_to_be_restored = list(set(weights_to_be_restored) - set(final_layer_weights) - set(confusion_variables))

                restorer = tf.train.Saver(var_list=weights_to_be_restored)
                restorer.restore(sess, config['training']['checkpoint_path'])
                print('Model weights restored from checkpoint file: %s' % config['training']['checkpoint_path'])

            num_batches = {
                'train': dataset.get_num_batches('train', config['training']['batch_size']),
                'val': dataset.get_num_batches('val', config['training']['batch_size'])
            }
            ops = {
                'train': [loss_op, merged_summary_op, optimization_op],
                'val': [loss_op, merged_summary_op, confusion_matrix_update_op]
            }

            # Start loading samples into FIFO queue
            coord, loader_thread = start_data_loader(
                sess, enqueue_op, queue_placeholders, model, dataset, config)

            # Save configuration file (with derived parameters) to session directory
            util.save_config(os.path.join(config['training']['session_dir'], 'config.json'), config)                

            # Start training
            sample_i = 0
            for epoch_i in range(config['training']['max_epochs']):
                print('\nEpoch: %d' % epoch_i)

                for s in ['train', 'val']:

                    is_training = (s == 'train')

                    if is_training:
                        print('Training set\nBatch/Total Batches | Loss | Items in Queue')
                    else:
                        print('Validation set\nBatch/Total Batches | Loss | Items in Queue')

                    for epoch_batch_i in range(num_batches[s]):

                        loss, summary, _ = sess.run(
                            ops[s], feed_dict={is_training_pl: is_training})

                        # Log statistics
                        if epoch_batch_i % config['training']['log_every_n_batches'] == 0:
                            summary_writers[s].add_summary(summary, sample_i)
                            summary_writers[s].flush()
                            print('%i/%i | %f | %d' % (epoch_batch_i + 1, num_batches[s], loss, get_queue_size_op.eval()))

                        # Only do when in training phase
                        if s == 'train':
                            sample_i += config['training']['batch_size']

                            # Save snapshot of model
                            if epoch_batch_i % config['training']['save_every_n_batches'] == 0:
                                save_path = saver.save(sess, os.path.join(
                                    config['training']['session_dir'], "model.ckpt"), global_step=epoch_i)
                                print('Checkpoint saved at batch %d to %s' % (
                                    epoch_batch_i, save_path))

                    # At the end of the training phase, save a final checkpoint;
                    # at the end of the validation phase, report per-class statistics
                    if s == 'train':
                        save_path = saver.save(sess, os.path.join(
                            config['training']['session_dir'], "model.ckpt"), global_step=epoch_i)
                        print('Checkpoint saved at batch %d to %s' % (epoch_batch_i, save_path))
                    elif s == 'val':
                        confusion_matrix = confusion_matrix_op.eval()

                        # Compute and print per-class statistics
                        true_positives, false_negatives, false_positives, ious = util.compute_per_class_statistics(confusion_matrix[:dataset.get_empty_class(),:dataset.get_empty_class()])
                        util.pretty_print_confusion_matrix(confusion_matrix, dataset.get_learnable_classes_strings())
                        util.pretty_print_per_class_statistics(dataset.get_learnable_classes_strings()[:dataset.get_empty_class()], true_positives, false_negatives, false_positives, ious)
                        avg_iou = np.mean(ious)

                        summary = tf.Summary()
                        summary.value.add(
                            tag='avg_iou', simple_value=avg_iou)

                        # Add per-class IoUs to summary to be viewable in Tensorboard
                        for class_i, class_label in enumerate(dataset.get_learnable_classes_strings()[:dataset.get_empty_class()]):
                            summary.value.add(
                                tag=class_label + '_iou', simple_value=ious[class_i])

                        summary_writers[s].add_summary(summary, sample_i)
                        summary_writers[s].flush()
                        confusion_matrix_clear_op.eval()

            coord.request_stop()
            coord.join([loader_thread])

            print('Training complete.')
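
The function expects the path to a config.json file. A minimal command-line
entry point could look like this (the --config flag and script layout are
assumptions, not necessarily the repository's actual CLI):

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train an FCPN model.')
    parser.add_argument('--config', required=True,
                        help='Path to a config.json file')
    args = parser.parse_args()
    train(args.config)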