def main():
    """Train COVIDNext50 on the COVIDx dataset described by the module-level `config`.

    Builds train/val loaders, optionally restores a checkpoint from
    config.weights, then runs the epoch/step training loop, logging batch
    metrics every config.log_steps steps and validating (plus LR scheduling
    on the validation score) every config.eval_steps steps.

    Raises:
        ValueError: if config.gpu is set but CUDA is not available.
    """
    if config.gpu and not torch.cuda.is_available():
        raise ValueError("GPU not supported or enabled on this system.")
    use_gpu = config.gpu

    log.info("Loading train dataset")
    train_dataset = COVIDxFolder(
        config.train_imgs, config.train_labels,
        transforms.train_transforms(config.width, config.height))
    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              shuffle=True,
                              drop_last=True,
                              num_workers=config.n_threads,
                              pin_memory=use_gpu)
    log.info("Number of training examples {}".format(len(train_dataset)))

    log.info("Loading val dataset")
    val_dataset = COVIDxFolder(
        config.val_imgs, config.val_labels,
        transforms.val_transforms(config.width, config.height))
    val_loader = DataLoader(val_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            num_workers=config.n_threads,
                            pin_memory=use_gpu)
    log.info("Number of validation examples {}".format(len(val_dataset)))

    if config.weights:
        # BUG FIX: the checkpoint was never actually loaded (the torch.load
        # call was commented out and `state` was set to None while the log
        # message still claimed the weights had been restored).
        # map_location='cpu' keeps this working on CPU-only machines; the
        # model is moved to GPU below after the weights are applied.
        state = torch.load(config.weights, map_location='cpu')
        log.info("Loaded model weights from: {}".format(config.weights))
    else:
        state = None

    state_dict = state["state_dict"] if state else None
    model = architecture.COVIDNext50(n_classes=config.n_classes)
    if state_dict:
        model = util.load_model_weights(model=model, state_dict=state_dict)

    if use_gpu:
        model.cuda()
        model = torch.nn.DataParallel(model)
    optim_layers = filter(lambda p: p.requires_grad, model.parameters())

    # optimizer and lr scheduler
    optimizer = Adam(optim_layers,
                     lr=config.lr,
                     weight_decay=config.weight_decay)
    scheduler = ReduceLROnPlateau(optimizer=optimizer,
                                  factor=config.lr_reduce_factor,
                                  patience=config.lr_reduce_patience,
                                  mode='max',
                                  min_lr=1e-7)

    # Load the last global_step from the checkpoint if existing
    global_step = 0 if state is None else state['global_step'] + 1

    class_weights = util.to_device(torch.FloatTensor(config.loss_weights),
                                   gpu=use_gpu)
    # BUG FIX: class_weights were computed and moved to the device but never
    # passed to the loss, silently ignoring config.loss_weights.
    loss_fn = CrossEntropyLoss(weight=class_weights)

    # Reset the best metric score
    best_score = -1
    for epoch in range(config.epochs):
        log.info("Started epoch {}/{}".format(epoch + 1, config.epochs))
        for data in train_loader:
            imgs, labels = data
            imgs = util.to_device(imgs, gpu=use_gpu)
            labels = util.to_device(labels, gpu=use_gpu)

            logits = model(imgs)
            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if global_step % config.log_steps == 0 and global_step > 0:
                # BUG FIX: `model` is only wrapped in DataParallel on GPU;
                # unwrap conditionally so CPU runs don't crash on `.module`.
                core = model.module if isinstance(model, torch.nn.DataParallel) else model
                probs = core.probability(logits)
                preds = torch.argmax(probs, dim=1).detach().cpu().numpy()
                labels = labels.cpu().detach().numpy()
                acc, f1, _, _ = util.clf_metrics(preds, labels)
                lr = util.get_learning_rate(optimizer)
                log.info("Step {} | TRAINING batch: Loss {:.4f} | F1 {:.4f} | "
                         "Accuracy {:.4f} | LR {:.2e}".format(
                             global_step, loss.item(), f1, acc, lr))

            if global_step % config.eval_steps == 0 and global_step > 0:
                best_score = validate(val_loader,
                                      model,
                                      best_score=best_score,
                                      global_step=global_step,
                                      cfg=config)
                # ReduceLROnPlateau in mode='max' lowers the LR when the
                # validation score stops improving.
                scheduler.step(best_score)
            global_step += 1
def train_baselines():
    """Train each baseline model in `models` (module-level list) with its paired
    batch size from `batch_size`, logging per-epoch losses and saving the best
    checkpoint (lowest validation loss) per model under ../models/<name>.pth.

    Uses legacy PyTorch idioms (Variable, loss.data[0], F.sigmoid), so it
    presumably targets torch <= 0.3 — TODO confirm before upgrading torch.
    """
    train_data, val_data = get_dataloader(96)
    for model, batch in zip(models, batch_size):
        # e.g. "<function resnet34 ...>" -> "resnet34"; relies on the
        # factory's repr containing the name as the second token.
        name = str(model).split()[1]
        print('*****Start Training {} with batch size {}******'.format(
            name, batch))
        print(
            ' epoch iter rate | smooth_loss | train_loss (acc) | valid_loss (acc) | total_train_loss\n'
        )
        logger = Logger('../log/{}'.format(name), name)
        net = model(pretrained=True)
        optimizer = get_optimizer(net,
                                  lr=.01,
                                  pretrained=True,
                                  resnet=True if 'resnet' in name else False)
        net = nn.DataParallel(net.cuda())
        train_data.batch_size = batch
        val_data.batch_size = batch
        num_epoches = 50  #100
        print_every_iter = 20
        epoch_test = 1  # validate every epoch

        # optimizer
        # optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0005)
        # optimizer = optim.Adam(net.parameters(), lr=1e-4, weight_decay=5e-4)

        smooth_loss = 0.0
        train_loss = np.nan
        train_acc = np.nan
        # test_loss = np.nan
        best_test_loss = np.inf
        # test_acc = np.nan
        t = time.time()

        for epoch in range(
                num_epoches):  # loop over the dataset multiple times
            # train loss averaged every epoch
            total_epoch_loss = 0.0
            lr_schedule(epoch, optimizer, pretrained=True)
            rate = get_learning_rate(optimizer)[0]  # check

            # sum_smooth_loss/sum form a sliding window reset every
            # print_every_iter iterations; total_epoch_loss/total_sum
            # accumulate over the whole epoch.
            sum_smooth_loss = 0.0
            total_sum = 0
            sum = 0
            net.cuda().train()
            num_its = len(train_data)
            for it, (images, labels, indices) in enumerate(train_data, 0):
                logits = net(Variable(images.cuda()))
                probs = F.sigmoid(logits)  # multi-label probabilities
                loss = multi_criterion(logits, labels.cuda())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # additional metrics
                sum_smooth_loss += loss.data[0]
                total_epoch_loss += loss.data[0]
                sum += 1
                total_sum += 1

                # print statistics
                if it % print_every_iter == print_every_iter - 1:
                    smooth_loss = sum_smooth_loss / sum
                    sum_smooth_loss = 0.0
                    sum = 0
                    train_acc = multi_f_measure(probs.data, labels.cuda())
                    train_loss = loss.data[0]
                    print('\r{} {} {} | {} | {} {} | ... '.format(
                        epoch + it / num_its, it + 1, rate, smooth_loss,
                        train_loss, train_acc),
                          end='',
                          flush=True)

            total_epoch_loss = total_epoch_loss / total_sum
            if epoch % epoch_test == epoch_test - 1 or epoch == num_epoches - 1:
                net.cuda().eval()
                test_loss, test_acc = evaluate(net, val_data)
                print('\r', end='', flush=True)
                print('{} {} {} | {} | {} {} | {} {} | {}'.format(
                    epoch + 1, it + 1, rate, smooth_loss, train_loss,
                    train_acc, test_loss, test_acc, total_epoch_loss))

                # save if the current loss is better
                if test_loss < best_test_loss:
                    print('save {} {}'.format(test_loss, best_test_loss))
                    torch.save(net.state_dict(),
                               '../models/{}.pth'.format(name))
                    # Reload immediately and re-evaluate — presumably a
                    # sanity check that the checkpoint round-trips; verify
                    # it is intentional before removing.
                    net.load_state_dict(
                        torch.load('../models/{}.pth'.format(name)))
                    print(evaluate(net, val_data))
                    best_test_loss = test_loss

            # NOTE(review): indentation reconstructed — these records use
            # test_loss/test_acc, which are only defined once the first
            # validation has run (safe here because epoch_test == 1).
            logger.add_record('train_loss', total_epoch_loss)
            logger.add_record('evaluation_loss', test_loss)
            logger.add_record('f2_score', test_acc)
            logger.save()
            logger.save_plot()
            logger.save_time(start_time=t, end_time=time.time())
def train_baselines():
    """Fine-tune each model in `models` on the full planet-amazon dataset with a
    10x learning-rate schedule, saving the best checkpoint (lowest validation
    loss) per model to a fixed /mnt/... path.

    NOTE(review): duplicate definition — a `train_baselines` with the same
    name appears earlier in this file; at import time this later definition
    shadows the earlier one. Confirm which is intended to run.
    """
    train_data, val_data = get_dataloader(96)
    for model, batch in zip(models, batch_size):
        # e.g. "<function resnet34 ...>" -> "resnet34"
        name = str(model).split()[1]
        print('*****Start Training {} with batch size {}******'.format(
            name, batch))
        print(
            ' epoch iter rate | smooth_loss | train_loss (acc) | valid_loss (acc) | total_train_loss\n'
        )
        logger = Logger(
            '/mnt/home/dunan/Learn/Kaggle/planet_amazon/log/full_data_{}_10xlr_2'
            .format(name), name)
        # load pre-trained model on train-37479
        net = model(pretrained=True)
        net = nn.DataParallel(net.cuda())
        # load_net(net, name)
        # optimizer = get_optimizer(net.module, lr=.005, pretrained=True, resnet=True if 'resnet' in name else False)
        optimizer = get_optimizer(net.module,
                                  lr=.01,
                                  pretrained=True,
                                  resnet=True if 'resnet' in name else False)
        train_data.batch_size = batch
        val_data.batch_size = batch
        num_epoches = 60
        print_every_iter = 20
        epoch_test = 1  # validate every epoch

        smooth_loss = 0.0
        train_loss = np.nan
        train_acc = np.nan
        best_test_loss = np.inf
        t = time.time()

        for epoch in range(
                num_epoches):  # loop over the dataset multiple times
            # train loss averaged every epoch
            total_epoch_loss = 0.0
            # lr_schedule(epoch, optimizer, base_lr=0.005, pretrained=True)
            new_lr_schedule(epoch, optimizer)
            rate = get_learning_rate(optimizer)[0]  # check

            # sliding-window loss (reset every print_every_iter) vs
            # whole-epoch running totals
            sum_smooth_loss = 0.0
            total_sum = 0
            sum = 0
            net.cuda().train()
            num_its = len(train_data)
            for it, (images, labels, indices) in enumerate(train_data, 0):
                logits = net(Variable(images.cuda()))
                probs = F.sigmoid(logits)  # multi-label probabilities
                loss = multi_criterion(logits, labels.cuda())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # additional metrics
                sum_smooth_loss += loss.data[0]
                total_epoch_loss += loss.data[0]
                sum += 1
                total_sum += 1

                # print statistics
                if it % print_every_iter == print_every_iter - 1:
                    smooth_loss = sum_smooth_loss / sum
                    sum_smooth_loss = 0.0
                    sum = 0
                    train_acc = multi_f_measure(probs.data, labels.cuda())
                    train_loss = loss.data[0]
                    print(
                        '\r{} {} {} | {} | {} {} | ... '.format(
                            epoch + it / num_its, it + 1, rate, smooth_loss,
                            train_loss, train_acc), )

            total_epoch_loss = total_epoch_loss / total_sum
            if epoch % epoch_test == epoch_test - 1 or epoch == num_epoches - 1:
                net.cuda().eval()
                test_loss, test_acc = evaluate(net, val_data)
                print('\r')
                print('{} {} {} | {} | {} {} | {} {} | {}'.format(
                    epoch + 1, it + 1, rate, smooth_loss, train_loss,
                    train_acc, test_loss, test_acc, total_epoch_loss))

                # save if the current loss is better
                if test_loss < best_test_loss:
                    print('save {} {}'.format(test_loss, best_test_loss))
                    torch.save(
                        net.state_dict(),
                        '/mnt/home/dunan/Learn/Kaggle/planet_amazon/model/full_data_{}_10xlr_2.pth'
                        .format(name))
                    best_test_loss = test_loss

            # NOTE(review): indentation reconstructed — test_loss/test_acc
            # are only bound after the first validation pass (safe here
            # because epoch_test == 1).
            logger.add_record('train_loss', total_epoch_loss)
            logger.add_record('evaluation_loss', test_loss)
            logger.add_record('f2_score', test_acc)
            logger.save()
            logger.save_plot()
            logger.save_time(start_time=t, end_time=time.time())
def train(config_path):
    """
    Trains a model for a maximum of config.max_epochs epochs

    Args:
        config_path: string, path to a config.json file

    Note: Python 2 / TensorFlow 1.x graph-mode code (print statements,
    tf.Session, tf.placeholder).
    """
    # Load configuration
    if not os.path.exists(config_path):
        print 'Error: No configuration file present at specified path.'
        return

    config = util.load_config(config_path)
    print 'Loaded configuration from: %s' % config_path

    # Create session directory
    # NOTE(review): this creates a NEW session when session_dir is missing
    # OR when the path already exists — the `os.path.exists` branch looks
    # inverted (one would expect `not os.path.exists(...)`); confirm against
    # create_new_session's intent.
    if 'session_dir' not in config['training'] or os.path.exists(config['training']['session_dir']):
        create_new_session(config)

    # Direct all output to screen and log file
    util.set_print_to_screen_and_file(
        os.path.join(config['training']['session_dir'], 'session.log'))

    model = fcpn.FCPN(config)
    dataset = data.Dataset(config)
    dataset.prepare(config['dataset']['refresh_cache'])
    # Derived parameter: number of PointNet features per sample, computed
    # from the feature volume shape at the configured spacing.
    config['model']['pointnet']['num'] = np.prod(model.get_feature_volume_shape(
        config['dataset']['training_samples']['spatial_size'],
        config['model']['pointnet']['spacing'], 1))

    # FIFO queue that the loader thread feeds with (points, voxels) batches
    enqueue_op, queue_placeholders, queue_batch_placeholders, get_queue_size_op = setup_queue(
        config['dataset']['training_samples']['num_points'] + config['model']['pointnet']['num'],
        dataset.get_num_output_voxels(),
        config['training']['batch_size'])

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = config['training']['gpu']['allow_growth']
    tf_config.allow_soft_placement = config['training']['gpu']['allow_soft_placement']
    sess = tf.Session(config=tf_config)
    with sess.as_default():
        with tf.device('/gpu:' + str(config['training']['gpu']['id'])):
            # Batch normalization
            # batch_i doubles as the global step for both the BN decay and
            # the LR schedule below (incremented by optimizer.minimize).
            batch_i = tf.Variable(0, name='batch_i')
            batch_normalization_decay = util.get_batch_normalization_decay(
                batch_i, config['training']['batch_size'],
                config['training']['optimizer']['batch_normalization']['initial_decay'],
                config['training']['optimizer']['batch_normalization']['decay_rate'],
                config['training']['optimizer']['batch_normalization']['decay_step'])

            tf.summary.scalar('batch_normalization_decay', batch_normalization_decay)

            # Toggles train/inference behavior (e.g. BN) per sess.run
            is_training_pl = tf.placeholder(tf.bool, shape=())

            # Build model
            pred_op = model.build_model(
                config['training']['batch_size'],
                config['dataset']['training_samples']['spatial_size'],
                queue_batch_placeholders['input_points_pl'],
                queue_batch_placeholders['input_features_pl'],
                is_training_pl,
                dataset.get_num_learnable_classes(),
                batch_normalization_decay)

            # Loss
            loss_op = model.get_loss(
                pred_op, queue_batch_placeholders['output_voxels_pl'],
                queue_batch_placeholders['output_voxel_weights_pl'])

            model.print_num_parameters()
            model.print_layer_weights()

            # Confusion matrix
            confusion_matrix_op, confusion_matrix_update_op, confusion_matrix_clear_op = model.get_confusion_matrix_ops(
                pred_op, queue_batch_placeholders['output_voxels_pl'],
                dataset.get_num_learnable_classes(), dataset.get_empty_class())

            # Optimizer
            learning_rate_op = util.get_learning_rate(
                batch_i, config['training']['batch_size'],
                config['training']['optimizer']['learning_rate']['initial'],
                config['training']['optimizer']['learning_rate']['decay_rate'],
                config['training']['optimizer']['learning_rate']['decay_step'])
            tf.summary.scalar('learning_rate', learning_rate_op)
            optimizer_op = tf.train.AdamOptimizer(learning_rate_op)

            # Optionally freeze everything except the upsampling head
            if config['training']['train_upsampling_only']:
                upsampling_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "upsampling")
                optimization_op = optimizer_op.minimize(loss_op, var_list=upsampling_weights, global_step=batch_i)
            else:
                optimization_op = optimizer_op.minimize(loss_op, global_step=batch_i)

            # Summary and Saving
            saver = tf.train.Saver(max_to_keep=config['training']['checkpoints_to_keep'])
            merged_summary_op = tf.summary.merge_all()
            summary_writers = {
                'train': tf.summary.FileWriter(os.path.join(config['training']['session_dir'], 'train'), sess.graph),
                'val': tf.summary.FileWriter(os.path.join(config['training']['session_dir'], 'val'))
            }

            # Initialize variables in graph
            init_g = tf.global_variables_initializer()
            init_l = tf.local_variables_initializer()
            sess.run([init_g, init_l], {is_training_pl: True})

            # Restore model weights from disk
            if config['training']['checkpoint_path']:
                weights_to_be_restored = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

                # If finetuning on a new dataset, don't load last layer weights or confusion matrix
                if config['training']['finetune_new_classes']:
                    final_layer_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="upsampling/15cm_to_5cm/final_conv")
                    confusion_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="confusion")
                    weights_to_be_restored = list(set(weights_to_be_restored) - set(final_layer_weights) - set(confusion_variables))

                restorer = tf.train.Saver(var_list=weights_to_be_restored)
                restorer.restore(sess, config['training']['checkpoint_path'])
                print 'Model weights restored from checkpoint file: %s' % config['training']['checkpoint_path']

            num_batches = {
                'train': dataset.get_num_batches('train', config['training']['batch_size']),
                'val': dataset.get_num_batches('val', config['training']['batch_size'])
            }

            # Per-phase ops: training runs the optimizer; validation only
            # accumulates the confusion matrix.
            ops = {
                'train': [loss_op, merged_summary_op, optimization_op],
                'val': [loss_op, merged_summary_op, confusion_matrix_update_op]
            }

            # Start loading samples into FIFO queue
            coord, loader_thread = start_data_loader(
                sess, enqueue_op, queue_placeholders, model, dataset, config)

            # Save configuration file (with derived parameters) to session directory
            util.save_config(os.path.join(config['training']['session_dir'], 'config.json'), config)

            # Start training
            sample_i = 0
            for epoch_i in range(config['training']['max_epochs']):
                print '\nEpoch: %d' % epoch_i

                for s in ['train', 'val']:
                    # NOTE(review): is_training is already (s == 'train');
                    # the reassignment in the branch below is redundant.
                    is_training = (s == 'train')
                    if s == 'train':
                        is_training = True
                        print 'Training set\nBatch/Total Batches | Loss | Items in Queue'
                    else:
                        print 'Validation set\nBatch/Total Batches | Loss | Items in Queue'

                    for epoch_batch_i in range(num_batches[s]):
                        loss, summary, _ = sess.run(
                            ops[s], feed_dict={is_training_pl: is_training})

                        # Log statistics
                        if epoch_batch_i % config['training']['log_every_n_batches'] == 0:
                            summary_writers[s].add_summary(summary, sample_i)
                            summary_writers[s].flush()
                            print '%i/%i | %f | %d' % (epoch_batch_i + 1,
                                                       num_batches[s], loss,
                                                       get_queue_size_op.eval())

                        # Only do when in training phase
                        if s == 'train':
                            sample_i += config['training']['batch_size']

                            # Save snapshot of model
                            if epoch_batch_i % config['training']['save_every_n_batches'] == 0:
                                save_path = saver.save(sess, os.path.join(
                                    config['training']['session_dir'], "model.ckpt"),
                                    global_step=epoch_i)
                                print 'Checkpoint saved at batch %d to %s' % (
                                    epoch_batch_i, save_path)

                    # End of phase: save a checkpoint after training;
                    # evaluate/log the confusion matrix after validation.
                    # (The original comment said "validation phase" on the
                    # train branch — the guard below checks s == 'train'.)
                    if s == 'train':
                        save_path = saver.save(sess, os.path.join(
                            config['training']['session_dir'], "model.ckpt"),
                            global_step=epoch_i)
                        print 'Checkpoint saved at batch %d to %s' % (epoch_batch_i, save_path)
                    elif s == 'val':
                        confusion_matrix = confusion_matrix_op.eval()

                        # Compute and print per-class statistics
                        # (the empty class row/column is excluded)
                        true_positives, false_negatives, false_positives, ious = util.compute_per_class_statistics(confusion_matrix[:dataset.get_empty_class(),:dataset.get_empty_class()])
                        util.pretty_print_confusion_matrix(confusion_matrix, dataset.get_learnable_classes_strings())
                        util.pretty_print_per_class_statistics(dataset.get_learnable_classes_strings()[:dataset.get_empty_class()], true_positives, false_negatives, false_positives, ious)

                        avg_iou = np.mean(ious)
                        summary = tf.Summary()
                        summary.value.add(
                            tag='avg_iou', simple_value=avg_iou)

                        # Add per-class IoUs to summary to be viewable in Tensorboard
                        for class_i, class_label in enumerate(dataset.get_learnable_classes_strings()[:dataset.get_empty_class()]):
                            summary.value.add(
                                tag=class_label + '_iou', simple_value=ious[class_i])

                        summary_writers[s].add_summary(summary, sample_i)
                        summary_writers[s].flush()
                        # Reset the accumulated confusion matrix for the
                        # next epoch's validation pass.
                        confusion_matrix_clear_op.eval()

    # Shut down the loader thread cleanly
    coord.request_stop()
    coord.join([loader_thread])
    print 'Training complete.'