def main(args):
    utils.set_seed_everywhere(args.seed)
    cfg = hyperparameters.get_config(args)
    cfg.seed = args.seed
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Create a timestamped experiment directory and save the config.
    time_str = datetime.now(timezone('US/Eastern')).strftime("%Y-%m-%d-%H-%M-%S")
    exp_dir = os.path.join(cfg.base_dir, time_str)
    checkpoint_dir = os.path.join(exp_dir, cfg.checkpoint_dir)
    log_dir = os.path.join(exp_dir, cfg.log_dir)
    save_config(cfg, exp_dir, "config.json")
    print("Log path: ", log_dir, "Checkpoint Dir: ", checkpoint_dir)

    num_timesteps = cfg.observed_steps + cfg.predicted_steps
    data_shape = {'image': (None, num_timesteps, 3, 64, 64)}
    cfg.data_shapes = data_shape

    model = KeypointModel(cfg)

    cp_callback = ModelCheckpoint(
        filepath=os.path.join(checkpoint_dir, "model_"),
        period=25,
        save_top_k=-1)
    logger = TensorBoardLogger(log_dir, name="", version=None)
    gpus = 1 if args.cuda else None

    if args.pretrained_path:
        checkpoint_path = get_latest_checkpoint(args.pretrained_path)
        import json
        model = KeypointModel.load_from_checkpoint(checkpoint_path)
        print(json.dumps(model.cfg, indent=4))

    print("On GPU Device: ", gpus)
    trainer = Trainer(
        max_epochs=args.num_epochs,
        logger=logger,
        checkpoint_callback=cp_callback,
        gpus=gpus,
        # distributed_backend='dp',
        progress_bar_refresh_rate=1,
        # gradient_clip_val=cfg.clipnorm,
        fast_dev_run=False,
        # train_percent_check=0.1, val_percent_check=0.0,
        # val_percent_check=0.3,
        track_grad_norm=2,
        show_progress_bar=True)
    trainer.fit(model)

    save_path = os.path.join(checkpoint_dir,
                             "model_final_" + str(args.num_epochs) + ".ckpt")
    print("Saving final model:")
    trainer.save_checkpoint(save_path)
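
# A minimal, hypothetical entry point for the training routine above. This is
# a sketch only: it assumes argparse flags matching the attributes that `main`
# reads (`--seed`, `--no_cuda`, `--num_epochs`, `--pretrained_path`); the
# repository may define its actual flags elsewhere (e.g. alongside
# hyperparameters.get_config), and additional flags may be required there.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train the keypoint dynamics model.")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--no_cuda", action="store_true",
                        help="Force CPU even if CUDA is available.")
    parser.add_argument("--num_epochs", type=int, default=100)
    parser.add_argument("--pretrained_path", type=str, default="",
                        help="Directory containing a checkpoint to resume from.")
    main(parser.parse_args())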
def viz_seq_unroll(args):
    torch.random.manual_seed(0)
    np.random.seed(0)
    cfg = hyperparameters.get_config(args)
    unroll_T = 16
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")

    l_dir = cfg.train_dir if args.is_train else args.test_dir
    print("Data loader: ", l_dir)

    loader, data_shapes = datasets.get_sequence_dataset(
        data_dir=os.path.join(cfg.data_dir, l_dir),
        batch_size=cfg.batch_size,
        num_timesteps=cfg.observed_steps + cfg.predicted_steps,
        shuffle=False)
    cfg.data_shapes = data_shapes

    model = train_dynamics.KeypointModel(cfg).to(device)

    if args.pretrained_path:
        checkpoint_path = get_latest_checkpoint(args.pretrained_path)
        checkpoint = torch.load(checkpoint_path,
                                map_location=lambda storage, loc: storage)
        print("Loading model from: ", checkpoint_path)
        model.load_state_dict(checkpoint['state_dict'])
        model.eval()

    with torch.no_grad():
        for data in islice(loader, 1):
            img_seq = data['image'].to(device)
            pred_img_seq, pred_keyp_seq = model.unroll(img_seq, unroll_T)

            bs, T = img_seq.shape[0], img_seq.shape[1]
            print(
                "LOSS:",
                F.mse_loss(img_seq, pred_img_seq[:, :T], reduction='sum') /
                (bs * T))
            print(img_seq.shape, pred_keyp_seq.shape, pred_img_seq.shape)

            imgs_seq_np, pred_img_seq_np = img_torch_to_numpy(
                img_seq), img_torch_to_numpy(pred_img_seq)
            keypoints_seq_np = pred_keyp_seq.cpu().numpy()

            num_seq = imgs_seq_np.shape[0]
            for i in islice(range(num_seq), 3):
                save_path = os.path.join(
                    args.vids_dir,
                    args.vids_path + "_" + l_dir + "_{}.mp4".format(i))
                print(i, "Video PRED Save Path", save_path)
                viz_all_unroll(imgs_seq_np[i], pred_img_seq_np[i],
                               keypoints_seq_np[i], True, 100, save_path)
def run_final_test(args):
    utils.set_seed_everywhere(args.seed)
    cfg = hyperparameters.get_config(args)
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")

    l_dir = cfg.train_dir if args.is_train else args.test_dir
    print("Data loader: ", l_dir)

    loader, data_shapes = datasets.get_sequence_dataset(
        data_dir=os.path.join(cfg.data_dir, l_dir),
        batch_size=cfg.batch_size,
        num_timesteps=2 * args.timesteps,
        shuffle=True)

    cfg.log_training = args.log_training
    cfg.log_training_path = os.path.join(args.log_training_path)
    cfg.data_shapes = data_shapes

    if args.no_first:
        if args.keyp_pred:
            print("Loading keyp pred")
            model = train_keyp_pred.KeypointModel(cfg).to(device)
        elif args.keyp_inverse:
            print("Loading Inverse Model")
            model = train_keyp_inverse.KeypointModel(cfg).to(device)
        else:
            pass
    else:
        model = train.KeypointModel(cfg).to(device)

    if args.pretrained_path:
        if args.ckpt:
            checkpoint_path = os.path.join(
                args.pretrained_path, "_ckpt_epoch_" + args.ckpt + ".ckpt")
        else:
            print("Loading latest")
            checkpoint_path = get_latest_checkpoint(args.pretrained_path)

        checkpoint = torch.load(checkpoint_path,
                                map_location=lambda storage, loc: storage)
        print("Loading model from: ", checkpoint_path)
        model.load_state_dict(checkpoint['state_dict'])
        model.eval()
        print("Load complete")

    trainer = Trainer(gpus=1,
                      progress_bar_refresh_rate=1,
                      show_progress_bar=True)
    trainer.test(model)
def main(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    cfg = hyperparameters.get_config(args)
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    time_str = datetime.now(timezone('US/Eastern')).strftime("%Y-%m-%d-%H-%M-%S")
    exp_dir = os.path.join(cfg.base_dir, time_str)
    checkpoint_dir = os.path.join(exp_dir, cfg.checkpoint_dir)
    log_dir = os.path.join(exp_dir, cfg.log_dir)
    save_config(cfg, exp_dir, "config.json")
    print("Log path: ", log_dir, "Checkpoint Dir: ", checkpoint_dir)

    data_shape = {'image': (None, 3, 64, 64)}
    cfg.data_shapes = data_shape

    model = KeypointModel(cfg)

    cp_callback = ModelCheckpoint(
        filepath=os.path.join(checkpoint_dir, "model_"),
        period=2,
        save_top_k=-1)
    logger = TensorBoardLogger(log_dir, name="", version=None)
    gpus = 1 if args.cuda else None

    if args.pretrained_path:
        checkpoint_path = get_latest_checkpoint(args.pretrained_path)
        import json
        model = KeypointModel.load_from_checkpoint(checkpoint_path)
        print(json.dumps(model.cfg, indent=4))

    print("On GPU Device: ", gpus)
    trainer = Trainer(max_epochs=args.num_epochs,
                      logger=logger,
                      checkpoint_callback=cp_callback,
                      gpus=gpus,
                      progress_bar_refresh_rate=1,
                      gradient_clip_val=cfg.clipnorm,
                      fast_dev_run=False)
    trainer.fit(model)
def main(args):
    cfg = hyperparameters.get_config(args)
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")

    l_dir = cfg.train_dir if args.is_train else args.test_dir
    print("Data loader: ", l_dir)

    loader, data_shapes = datasets.get_dataset(
        data_dir=os.path.join(cfg.data_dir, l_dir),
        batch_size=cfg.batch_size)

    models = build_model_noseq(cfg, data_shapes).to(device)

    if args.pretrained_path:
        model_path = get_latest_checkpoint(args.pretrained_path, "*.pth")
        print("Loading model from: ", model_path)
        models.load_state_dict(torch.load(model_path))
        models.eval()

    images_to_keypoints_net, keypoints_to_images_net = models

    with torch.no_grad():
        for i, data in islice(enumerate(loader), 5):
            img = data['image'].to(device)
            keypoints, _ = images_to_keypoints_net(img)
            pred_img = keypoints_to_images_net(keypoints)
            print(img.shape, keypoints.shape, pred_img.shape)

            save_path = os.path.join(
                args.vids_dir,
                args.vids_path + "_" + l_dir + "_{}.mp4".format(i))
            print(i, "Video Save Path", save_path)

            imgs_np, pred_img_np = img_torch_to_numpy(img), img_torch_to_numpy(
                pred_img)
            keypoints_np = keypoints.cpu().numpy()

            viz_all(imgs_np, pred_img_np, keypoints_np, True, 300, save_path)
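
# `get_latest_checkpoint` is defined elsewhere in this repository. A plausible
# sketch of what such a helper does, assuming it globs the checkpoint directory
# for files matching a pattern and returns the most recently modified match
# (the real helper may instead sort by epoch number encoded in the filename):
def get_latest_checkpoint_sketch(checkpoint_dir, pattern="*.pth"):
    """Returns the newest file in `checkpoint_dir` matching `pattern`, or None."""
    import glob
    import os
    candidates = glob.glob(os.path.join(checkpoint_dir, pattern))
    if not candidates:
        return None
    # Pick the file with the latest modification time.
    return max(candidates, key=os.path.getmtime)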
def main(argv):
    del argv  # unused arg
    tf.io.gfile.makedirs(FLAGS.output_dir)
    logging.info('Saving Deferred Prediction evaluation results to %s',
                 FLAGS.output_dir)
    train_seed = FLAGS.train_seed
    eval_seed = FLAGS.eval_seed if FLAGS.eval_seed is not None else train_seed
    tf.random.set_seed(eval_seed)

    if FLAGS.num_cores > 1:
        raise ValueError('Only a single accelerator is currently supported.')

    if FLAGS.use_gpu:
        logging.info('Use GPU')
    else:
        logging.info('Use CPU')
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    model_type = FLAGS.model_type
    eval_batch_size = FLAGS.eval_batch_size * FLAGS.num_cores
    num_mc_samples = FLAGS.num_mc_samples

    # Deferred Prediction flags
    deferred_prediction_fractions = sorted(FLAGS.deferred_prediction_fractions)
    uncertainty_type = FLAGS.uncertainty_type

    try:
        uncertainty_estimator_fn = (
            deferred_prediction
            .RETINOPATHY_MODEL_TO_UNCERTAINTY_ESTIMATOR[model_type])
    except KeyError:
        raise NotImplementedError(
            'Unsupported model type. Try implementing a wrapper to retrieve '
            'predictive uncertainty, as in deferred_prediction.py.')

    # Load test set
    # As per the Kaggle challenge, we have split sizes:
    #   train: 35,126
    #   validation: 10,906 (currently unused)
    #   test: 42,670
    ds_info = tfds.builder('diabetic_retinopathy_detection').info
    steps_per_test_eval = ds_info.splits['test'].num_examples // eval_batch_size
    data_dir = FLAGS.data_dir
    dataset_test_builder = ub.datasets.get(
        'diabetic_retinopathy_detection', split='test', data_dir=data_dir)
    dataset_test = dataset_test_builder.load(batch_size=eval_batch_size)

    if FLAGS.use_bfloat16:
        policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
        tf.keras.mixed_precision.experimental.set_policy(policy)

    logging.info('Building Keras ResNet-50 %s model.', model_type)

    # Initialize test metrics
    # For each type of metric, e.g., AUC, initialize
    # one aggregator per deferred prediction fraction.
    metrics = utils.get_diabetic_retinopathy_base_test_metrics(
        use_tpu=False,
        num_bins=FLAGS.num_bins,
        deferred_prediction_fractions=deferred_prediction_fractions)
    test_metric_fns = utils.get_diabetic_retinopathy_test_metric_fns(
        use_tpu=False)
    metrics.update(
        utils.get_diabetic_retinopathy_cpu_test_metrics(
            deferred_prediction_fractions=deferred_prediction_fractions))
    metrics.update({'test/ms_per_example': tf.keras.metrics.Mean()})

    # Load from checkpoint
    def load_keras_model(checkpoint):
        model = tf.keras.models.load_model(checkpoint, compile=False)
        logging.info('Successfully loaded model from checkpoint %s.', checkpoint)
        logging.info('Model input shape: %s', model.input_shape)
        logging.info('Model output shape: %s', model.output_shape)
        logging.info('Model number of weights: %s', model.count_params())
        return model

    # TODO(nband): debug, switch from keras.models.save to tf.train.Checkpoint
    checkpoint_filenames = utils.parse_keras_models(FLAGS.checkpoint_dir)
    if not checkpoint_filenames:
        raise Exception(
            f'Did not locate a Keras checkpoint in checkpoint directory '
            f'{FLAGS.checkpoint_dir}')

    # Load in models and wrap, to apply sigmoid on logits, use mixed precision,
    # and cast to NumPy array for use with Deferred Prediction API.
    if model_type in {'ensemble', 'dropoutensemble'}:
        estimator = []
        for checkpoint_file in checkpoint_filenames:
            loaded_model = load_keras_model(checkpoint=checkpoint_file)
            estimator.append(
                deferred_prediction.wrap_retinopathy_estimator(
                    loaded_model, use_mixed_precision=FLAGS.use_bfloat16))
    else:
        latest_checkpoint_file = utils.get_latest_checkpoint(
            file_names=checkpoint_filenames)
        loaded_model = load_keras_model(checkpoint=latest_checkpoint_file)
        estimator = deferred_prediction.wrap_retinopathy_estimator(
            loaded_model, use_mixed_precision=FLAGS.use_bfloat16)

    # Uncertainty estimation arguments -- dependent on model_type
    estimator_args = {'uncertainty_type': uncertainty_type}

    if model_type in {
            'dropout', 'radial', 'variational_inference', 'dropoutensemble'
    }:
        # Argument for stochastic forward passes
        estimator_args['num_samples'] = num_mc_samples

    # Containers used for caching performance evaluation
    y_true = list()
    y_pred = list()
    y_uncertainty = list()

    test_iterator = iter(dataset_test)
    for step in range(steps_per_test_eval):
        if step % 100 == 0:
            logging.info('Evaluated %d/%d batches.', step, steps_per_test_eval)

        test_start_time = time.time()
        inputs = next(test_iterator)  # pytype: disable=attribute-error
        images = inputs['features']
        labels = inputs['labels']

        # Obtain the predictive mean and uncertainty of the estimator.
        # training_setting=False disables BatchNorm at evaluation time.
        # We manually enable dropout at evaluation time (as desired) in the
        # model implementations;
        # e.g. see `apply_dropout` in models/resnet50_dropout.py.

        # Sample from probabilistic model
        batch_mean, batch_uncertainty = uncertainty_estimator_fn(
            images, estimator, training_setting=False, **estimator_args)

        # Cache predictions
        y_true.append(labels)
        y_pred.append(batch_mean)
        y_uncertainty.append(batch_uncertainty)

        ms_per_example = (time.time() - test_start_time) * 1e6 / eval_batch_size
        metrics['test/ms_per_example'].update_state(ms_per_example)

    # Use vectorized NumPy containers
    y_true = np.concatenate(y_true).flatten()
    y_pred = np.concatenate(y_pred).flatten()
    y_uncertainty = np.concatenate(y_uncertainty).flatten()

    # Evaluate and update metrics
    deferred_prediction.update_metrics_keras(
        y_true=y_true,
        y_pred=y_pred,
        y_uncertainty=y_uncertainty,
        metrics_dict=metrics,
        test_metric_fns=test_metric_fns,
        fractions=deferred_prediction_fractions)

    # Print evaluation metrics
    total_results = {name: metric.result() for name, metric in metrics.items()}
    pprint.pprint(total_results)

    # Store results as DataFrame, for easy downstream plotting
    model_results_path = os.path.join(FLAGS.output_dir, model_type)
    if not tf.io.gfile.isdir(model_results_path):
        tf.io.gfile.mkdir(model_results_path)

    parsed_results_dict = deferred_prediction.store_keras_metrics(
        metrics_dict=metrics,
        model_type=model_type,
        model_results_path=model_results_path,
        train_seed=train_seed,
        eval_seed=eval_seed,
        return_parsed_dict=True)

    # Use parsed results for logging tf.Summary
    summary_path = os.path.join(
        model_results_path,
        f'summaries__trainseed_{train_seed}__evalseed_{eval_seed}')
    summary_writer = tf.summary.create_file_writer(summary_path)
    with summary_writer.as_default():
        for metric_name in parsed_results_dict.keys():
            for retain_proportion, result in parsed_results_dict[metric_name]:
                # step param only tolerates ints, so we multiply by 100 to give
                # the retain percentage
                tf.summary.scalar(
                    metric_name,
                    result,
                    step=tf.constant(int(retain_proportion * 100), dtype=tf.int64))

    logging.info('Wrote tf.Summary results to %s.', summary_path)
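
# For reference, a minimal NumPy sketch of the deferred-prediction scoring that
# `deferred_prediction.update_metrics_keras` performs above: for each retain
# fraction, the most-uncertain predictions are deferred (excluded from scoring)
# and the metric is computed on the confident remainder. The helper name and
# exact tie-breaking here are illustrative, not the module's actual API.
def deferred_metric_sketch(y_true, y_pred, y_uncertainty, retain_fraction,
                           metric_fn):
    """Scores `metric_fn` on the `retain_fraction` least-uncertain examples."""
    num_retain = int(len(y_true) * retain_fraction)
    # Ascending sort by uncertainty; keep the most confident predictions.
    retained = np.argsort(y_uncertainty)[:num_retain]
    return metric_fn(y_true[retained], y_pred[retained])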
def viz_seq(args):
    utils.set_seed_everywhere(args.seed)
    cfg = hyperparameters.get_config(args)
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")

    l_dir = cfg.train_dir if args.is_train else args.test_dir
    print("Data loader: ", l_dir)

    loader, data_shapes = datasets.get_sequence_dataset(
        data_dir=os.path.join(cfg.data_dir, l_dir),
        batch_size=10,
        num_timesteps=args.timesteps,
        shuffle=True)
    cfg.data_shapes = data_shapes

    model = train_keyp_inverse_forward.KeypointModel(cfg).to(device)

    if args.pretrained_path:
        if args.ckpt:
            checkpoint_path = os.path.join(
                args.pretrained_path, "_ckpt_epoch_" + args.ckpt + ".ckpt")
        else:
            print("Loading latest")
            checkpoint_path = get_latest_checkpoint(args.pretrained_path)

        checkpoint = torch.load(checkpoint_path,
                                map_location=lambda storage, loc: storage)
        print("Loading model from: ", checkpoint_path)
        model.load_state_dict(checkpoint['state_dict'])
        model.eval()
        print("Load complete")

    with torch.no_grad():
        for data in islice(loader, 1):
            img_seq = data['image'].to(device)
            action_seq = data['action'].to(device)

            keypoints_seq, heatmaps_seq, pred_keyp_seq, pred_action_seq = model(
                img_seq, action_seq)

            print(
                "Keypoint Pred LOSS:",
                F.mse_loss(pred_keyp_seq[Ellipsis, :2],
                           keypoints_seq[:, 1:, :, :2],
                           reduction='sum') /
                (pred_keyp_seq.shape[0] * pred_keyp_seq.shape[1]))

            if args.unroll:
                pred_keyp_seq = model.unroll(img_seq, action_seq)
                pred_keyp_seq_np = pred_keyp_seq.cpu().numpy()

            print(img_seq.shape, keypoints_seq.shape)

            img_seq_np = img_torch_to_numpy(img_seq)
            heatmaps_seq_np = heatmaps_seq.permute(0, 1, 3, 4, 2).cpu().numpy()
            keypoints_seq_np = keypoints_seq.cpu().numpy()

            d = {
                'img': img_seq_np,
                'keyp': keypoints_seq_np,
                'heatmap': heatmaps_seq_np,
                'action': data['action'].cpu().numpy() if 'action' in data else None
            }

            tmp_save_path = 'tmp_data/{}_data_{}_seed_{}'.format(
                l_dir, args.vids_path, args.seed)
            print("Save intermediate data path: ", tmp_save_path)
            np.savez(tmp_save_path, **d)

            num_seq = img_seq_np.shape[0]
            for i in islice(range(num_seq), 3):
                save_path = os.path.join(
                    args.vids_dir,
                    args.vids_path + "_" + l_dir + "_{}_seed_{}.mp4".format(
                        i, args.seed))
                print(i, "Video Save Path", save_path)
                viz_keypoints(img_seq_np[i], keypoints_seq_np[i], True, 100,
                              save_path, args.annotate)
elif args.scale == "medium":
    maxside = 1024
elif args.scale == "big":
    maxside = 2048
else:
    maxside = None

project = "ade20k"
config = utils.get_config(project)
im_list = utils.open_im_list(config["im_list"])
datasource = DataSource(config)
data_generator = DataGenerator(im_list, datasource, maxside=maxside)

# Load checkpoint
checkpoint_dir = join("weights", "checkpoints", args.name)
if not isdir(checkpoint_dir):
    makedirs(checkpoint_dir)
checkpoint, epoch = (None, 0)
if args.resume:
    checkpoint, epoch = utils.get_latest_checkpoint(checkpoint_dir)

sess = tf.Session()
K.set_session(sess)
with sess.as_default():
    print(args)
    pspnet = PSPNet50(activation=args.activation, checkpoint=checkpoint)
    train(pspnet, data_generator, checkpoint_dir, initial_epoch=epoch)