def reset_session(args):
    """Re-initialize all TF variables and restore the most recent checkpoint.

    Relies on the module-level ``sess`` (tf.Session) and ``saver``
    (tf.train.Saver) created at script startup.

    Args:
        args: Parsed arguments; ``args.dir`` is the checkpoint directory.
    """
    hem.message('Resetting variables...')
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    hem.message('Restoring checkpoint...')
    # NOTE(review): latest_checkpoint returns None if no checkpoint exists,
    # and saver.restore would then fail — assumes a checkpoint is present.
    latest = tf.train.latest_checkpoint(args.dir)
    print(latest)
    saver.restore(sess, latest)
def calculate_metrics(dataset_name, dataset_handle, n_batches):
    """Compute and print depth-estimation metrics for one dataset split.

    Runs three passes over ``n_batches`` batches: the model's own metrics,
    a baseline using the split's mean depth map, and a baseline using an
    all-zero depth map. Also writes the mean depth map (raw and colorized)
    to ``<args.dir>/metrics/``.

    Uses module-level ``sess``, metric tensors (``m1``/``m2``/``m_mean_*``/
    ``m_g0_*``), ``y``/``y2``, the feed placeholders, ``args``, and
    ``zero_image_batch`` (defined elsewhere in the file — TODO confirm).

    Args:
        dataset_name: Label used in messages and output filenames.
        dataset_handle: Dataset iterator handle to feed.
        n_batches: Number of batches to evaluate.
    """
    metric_keys = ['t1', 't2', 't3', 'abs_rel_diff', 'squared_rel_diff',
                   'linear_rmse', 'log_rmse', 'scale_invariant_log_rmse']
    # Each batch contributes two metric dicts (two towers), so average by 2n.
    n = n_batches * 2

    def _accumulate(tensors, feed):
        # Sum the two per-batch metric dicts over n_batches runs.
        # NOTE(review): Counter '+' drops non-positive entries; assumes all
        # metric values are positive (matches original behavior) — confirm.
        r = sess.run(tensors, feed_dict=feed)
        counts = Counter(r[0]) + Counter(r[1])
        for _ in range(n_batches - 1):
            r = sess.run(tensors, feed_dict=feed)
            counts = counts + Counter(r[0]) + Counter(r[1])
        return counts

    def _report(title, counts):
        # Print the averaged metrics in a fixed order.
        hem.message(title)
        for k in metric_keys:
            print('\t{}: {:.3f}'.format(k, counts[k] / n))

    reset_session(args)
    hem.message('Calculating metrics for {} set...'.format(dataset_name))

    # Pass 1: model metrics, while also collecting ground-truth depth maps.
    # Collect into a list and concatenate once (the original re-concatenated
    # the growing array every batch, which is O(n^2) in copies).
    feed = {handle_placeholder: dataset_handle}
    results = sess.run([m1, m2, y, y2], feed_dict=feed)
    g_metrics = Counter(results[0]) + Counter(results[1])
    depth_maps = [results[2], results[3]]
    for _ in range(n_batches - 1):
        results = sess.run([m1, m2, y, y2], feed_dict=feed)
        g_metrics = g_metrics + Counter(results[0]) + Counter(results[1])
        depth_maps.extend([results[2], results[3]])
    _report('Model metrics:', g_metrics)

    # Mean ground-truth depth map over every example seen, saved for reference.
    mean_image = np.mean(np.concatenate(depth_maps, axis=0), axis=0)
    mean_depth, mean_depth_colorized = colorize_depthmap(mean_image)
    cv2.imwrite(os.path.join(args.dir, 'metrics',
                             '{}_mean.png'.format(dataset_name)), mean_depth)
    cv2.imwrite(os.path.join(args.dir, 'metrics',
                             '{}_mean_colorized.png'.format(dataset_name)),
                mean_depth_colorized)

    # Pass 2: baseline metrics when predicting the dataset-mean depth map.
    mean_image_batch = np.stack([mean_image] * args.batch_size, axis=0)
    mean_metrics = _accumulate(
        [m_mean_1, m_mean_2],
        {handle_placeholder: dataset_handle,
         mean_image_placeholder: mean_image_batch})
    _report('Mean metrics:', mean_metrics)

    # Pass 3: baseline metrics when predicting all zeros (g = 0).
    # NOTE(review): zero_image_batch is not defined in this chunk — presumed
    # module-level; verify it exists before calling.
    zero_metrics = _accumulate(
        [m_g0_1, m_g0_2],
        {handle_placeholder: dataset_handle,
         mean_image_placeholder: zero_image_batch})
    _report('Zero metrics:', zero_metrics)
y_hat = g # y_0 = g_0 return x, y, g, y_hat, y_bar def cgan_mean_nodes(tower=0): x = graph.as_graph_element('tower_{}/input_preprocess/Reshape'.format(tower)).outputs[0] y = graph.as_graph_element('tower_{}/input_preprocess/Reshape_1'.format(tower)).outputs[0] y_bar = graph.as_graph_element('tower_{}/input_preprocess/Mean'.format(tower)).outputs[0] # g_0 = graph.as_graph_element('tower_{}/generator/zeros_like'.format(tower)).outputs[0] g = graph.as_graph_element('tower_{}/generator/decoder/transpose_1'.format(tower)).outputs[0] y_hat = graph.as_graph_element('tower_{}/generator/add'.format(tower)).outputs[0] # y_0 = graph.as_graph_element('tower_{}/generator/add_1'.format(tower)).outputs[0] return x, y, g, y_hat, y_bar hem.message('Parsing arguments...') args = hem.parse_args() hem.message('Loading metafile and graph data...') sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) saver = tf.train.import_meta_graph(os.path.join(args.dir, 'checkpoint-50.meta')) graph = tf.get_default_graph() hem.message('Loading dataset...') x, handle, iterators = hem.get_dataset_tensors(args) sess.run(iterators['train']['x'].initializer) sess.run(iterators['validate']['x'].initializer) train_handle = sess.run(iterators['train']['handle']) validate_handle = sess.run(iterators['validate']['handle']) handle_placeholder = graph.as_graph_element('input_pipeline/Placeholder').outputs[0] # mean_image_placeholder = graph.as_graph_element('Placeholder').outputs[0]
def parse_args(display=False):
    """Parse general, dataset-specific, and model-specific CLI arguments.

    Parsing happens in three stages: general arguments first, then the
    chosen dataset's extra arguments, then the chosen model's, each stage
    consuming what it recognizes and passing the leftovers on. Unrecognized
    leftovers at the end produce a warning rather than an error.

    Args:
        display: If True, print every final argument value.

    Returns:
        argparse.Namespace with all merged arguments; ``args.seed`` is
        populated (randomly, via os.urandom) when not given, and the
        ``random`` module is seeded with it as a side effect.
    """
    parser = hem.CustomArgumentParser(
        description='Autoencoder training harness.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        fromfile_prefix_chars='@',
        conflict_handler='resolve',
        epilog="""Example: python train.py @path/to/config_file --dir workspace/model_test --lr 0.1""")
    # Drop the default positional/optional groups so only our named groups show.
    parser._action_groups.pop()
    data_args = parser.add_argument_group('Data')
    optimizer_args = parser.add_argument_group('Optimizer')
    train_args = parser.add_argument_group('Training')
    misc_args = parser.add_argument_group('Miscellaneous')
    # TODO add support for specifying additional directories for data and model plugins

    # misc settings
    add = misc_args.add_argument
    add('--seed', type=int,
        help="Useful for debugging. Randomized each execution if not set.")
    add('--n_gpus', type=int, default=1,
        help="""Number of GPUs to use for simultaneous training. Model will be duplicated on each device and results averaged on CPU.""")
    add('--profile', default=False, action='store_true',
        help="""Enables runtime metadata collection during training that is viewable in TensorBoard.""")
    add('--check_numerics', default=False, action='store_true',
        help="""Enables numeric checks for nan/inf in gradients for more detailed error reporting.""")
    add('--model', type=lambda s: s.lower(), default='fc',
        help="Name of model to train.")

    # training settings
    add = train_args.add_argument
    add('--epochs', default='3',
        help="""Number of epochs to train for during this run. Use an integer to denote the max number of epochs to train for, or `+n` for an additional n epochs from a saved checkpoint.""")
    add('--batch_size', type=int, default=256,
        help="Batch size to use, per device.")
    add('--epoch_size', type=int, default=-1,
        help="""Number of iterations to use per epoch. Defaults to using the entire dataset.""")
    add('--dir', type=str, default='workspace/{}'.format(uuid.uuid4()),
        help="""Location to store checkpoints, logs, etc. If this location is populated by a previous run then training will be continued from last checkpoint.""")
    add('--max_to_keep', type=int, default=0,
        help="""Max (most recent) number of saved sessions to keep, once per epoch. Set to 0 to keep every one.""")
    add('--test_epochs', nargs='*', default=[], type=int,
        help="""List of epochs where the model should be run against the Test dataset. Leave blank to run at the end of training (--epochs argument).""")

    # optimizer settings
    add = optimizer_args.add_argument
    add('--optimizer', type=lambda s: s.lower(), default='rmsprop',
        help="Optimizer to use during training.")
    add('--lr', type=float, default=0.001,
        help="Learning rate of optimizer (if supported).")
    add('--loss', type=lambda s: s.lower(), default='l1',
        help="Loss function used by model during training (if supported).")
    add('--momentum', type=float, default=0.01,
        help="Momentum value used by optimizer (if supported).")
    add('--decay', type=float, default=0.9,
        help="Decay value used by optimizer (if supported).")
    add('--centered', default=False, action='store_true',
        help="Enables centering in RMSProp optimizer.")
    add('--beta1', type=float, default=0.9,
        help="Value for optimizer's beta_1 (if supported).")
    add('--beta2', type=float, default=0.999,
        help="Value for optimizer's beta_2 (if supported).")

    # data/pipeline settings
    add = data_args.add_argument
    add('--dataset', type=lambda s: s.lower(), default='floorplan',
        help="Name of dataset to use.")
    # NOTE(review): default=True with store_true means shuffling can never be
    # disabled from the command line; a --no-shuffle flag would fix this, but
    # that is an interface change left for a follow-up.
    add('--shuffle', default=True, action='store_true',
        help="""Set this to shuffle the dataset every epoch.""")
    add('--buffer_size', type=int, default=10000,
        help="""Size of the data buffer.""")
    add('--cache_dir', default=None,
        help="""Cache dataset to the directory specified. If not provided, will attempt to cache to memory.""")
    add('--raw_dataset_dir', default='/tmp',
        help="Location of raw dataset files, if needed")
    add('--dataset_dir', default='datasets',
        help="Location of prepared tfrecord files for the requested dataset.")
    add('--n_threads', type=int, default=multiprocessing.cpu_count(),
        help="""Number of threads to use for processing datasets.""")

    # parse main/general arguments
    args, leftover_args = parser.parse_known_args()
    # parse dataset-specific arguments
    for k, v in hem.get_dataset(args.dataset).arguments().items():
        parser.add_argument(k, **v)
    args, leftover_args = parser.parse_known_args(leftover_args, namespace=args)
    # parse model-specific arguments
    model = hem.get_model(args.model)
    for k, v in model.arguments().items():
        parser.add_argument(k, **v)
    args, leftover_args = parser.parse_known_args(leftover_args, namespace=args)
    if len(leftover_args) > 0:
        hem.message(
            'WARNING: unknown and unused arguments provided: {}'.format(
                leftover_args), format_style=hem.WARNING)

    # set seed (useful for debugging purposes)
    if args.seed is None:
        args.seed = os.urandom(4)
    random.seed(args.seed)

    if display:
        for a in vars(args):
            v = getattr(args, a)
            print(' {} = {}'.format(a, v))
    return args
def train(model, iterators, handle, sv, args, reset=False):
    """Run the main training loop with periodic summaries and checkpoints.

    Trains for up to ``args.epochs`` epochs (``'+n'`` means n additional
    epochs past the restored checkpoint), writing TensorBoard summaries,
    saving a checkpoint per epoch, and running validation (and optionally
    test) inference after each epoch.

    Args:
        model: Model object exposing ``train(sess, args, feed_dict)``.
        iterators: Dict of dataset iterators ('train'/'validate'/'test'),
            each with 'x', 'handle', and 'batches' entries.
        handle: Dataset-handle placeholder tensor to feed.
        sv: Supervisor-like helper holding the session manager, saver,
            summary writers/op, and global step/epoch counters.
        args: Parsed arguments (``dir``, ``epochs``, ``test_epochs``).
        reset: If True, zero the global step and epoch counters first.
    """
    try:
        checkpoint_path = os.path.join(args.dir, 'checkpoint')
        losses = hem.collection_to_dict(tf.get_collection('losses'))
        with sv.sv.managed_session(
                config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            start_time = time.time()
            if reset:
                sess.run(sv.reset_global_step)
                sess.run(sv.reset_global_epoch)
            current_step = int(sess.run(sv.global_step))
            current_epoch = int(sess.run(sv.global_epoch))
            # '+n' trains n epochs beyond the checkpoint; plain 'n' is absolute.
            max_epochs = current_epoch + int(
                args.epochs[1:]) if '+' in args.epochs else int(args.epochs)

            # initialize datasets and get feedable handles
            for it in iterators.values():
                sess.run(it['x'].initializer)
            training_handle = sess.run(iterators['train']['handle'])
            validation_handle = sess.run(iterators['validate']['handle'])
            if 'test' in iterators and iterators['test']['handle'] is not None:
                test_handle = sess.run(iterators['test']['handle'])

            # save model params before any training has been done
            if current_step == 0:
                hem.message('Generating baseline summaries and checkpoint...')
                sv.sv.saver.save(sess, save_path=checkpoint_path,
                                 global_step=sv.global_step)
                sv.summary_writers['train'].add_summary(
                    sess.run(sv.summary_op,
                             feed_dict={handle: validation_handle}),
                    global_step=sess.run(sv.global_step))

            hem.message('Starting training...')
            n_train_batches = iterators['train']['batches']
            for epoch in range(current_epoch, max_epochs):
                prog_bar = tqdm(range(n_train_batches),
                                desc='Epoch {:3d}'.format(epoch + 1),
                                unit='batch')
                # Record 10 extra summaries per epoch in the first 3 epochs,
                # 3 per epoch afterwards. max(1, ...) guards against a
                # ZeroDivisionError when the dataset has very few batches.
                summary_interval = max(
                    1, n_train_batches // (10 if epoch < 3 else 3))
                running_total = None
                for i in prog_bar:
                    status = model.train(sess, args, {handle: training_handle})
                    # Fix: keep the updated moving average; the original
                    # discarded the return value, leaving running_total None.
                    running_total = hem.update_moving_average(
                        status, running_total, prog_bar)
                    if i % summary_interval == 0:
                        sv.summary_writers['train'].add_summary(
                            sess.run(sv.summary_op,
                                     feed_dict={handle: training_handle}),
                            global_step=sess.run(sv.global_step))
                    sess.run(sv.increment_global_step)

                # update epoch count
                sess.run(sv.increment_global_epoch)
                current_epoch = int(sess.run(sv.global_epoch))
                # generate end-of-epoch summaries
                sv.summary_writers['train'].add_summary(
                    sess.run(sv.summary_op,
                             feed_dict={handle: training_handle}),
                    global_step=sess.run(sv.global_step))
                # save checkpoint
                sv.sv.saver.save(sess, save_path=checkpoint_path,
                                 global_step=sv.global_epoch)
                # perform validation
                hem.inference(sess, losses, sv.summary_op,
                              iterators['validate']['batches'], handle,
                              validation_handle, 'Validation',
                              sv.summary_writers['validate'], sv.global_step)
                # perform testing, if asked
                if (epoch + 1) in args.test_epochs:
                    # Fix: pass sv.summary_op to match the validation call —
                    # the original omitted it here (TODO confirm
                    # hem.inference's signature).
                    hem.inference(sess, losses, sv.summary_op,
                                  iterators['test']['batches'], handle,
                                  test_handle, 'Test',
                                  sv.summary_writers['test'], sv.global_step)

            hem.message('\nTraining complete! Elapsed time: {}s'.format(
                int(time.time() - start_time)))
    except Exception as e:
        # Fix: Python 3 exceptions have no .message attribute; printing it
        # raised AttributeError inside the handler and masked the real error.
        print('Caught unexpected exception during training:', e)
        sys.exit(-1)
# TODO Use tf.tile to duplicate dataset into two branches, one for estimator
#      and one for GAN
# TODO
#  1. argument parsing only parses one model and discards the remaining ones
#  2.

hem.message('Welcome to Hem')
hem.message('Initializing...')
args = hem.parse_args(display=True)
hem.init_working_dir(args)
# vars(args) returns args.__dict__, so writes to vargs below also update args.
vargs = vars(args)

hem.message('Initializing dataset...')
x, handle, iterators = hem.get_dataset_tensors(args)

hem.message('Initializing model...')
estimator_model = hem.get_model('mean_depth_estimator')(x, args)
# Force the sampler's generator/discriminator architecture before building it.
vargs['g_arch'] = 'E2'
vargs['d_arch'] = 'E2'
sampler_model = hem.get_model('experimental_sampler')(x, estimator_model, args)
def process_example(scene, frame, g, y_hat, args, x_stride=10, y_stride=10,
                    save_images=True):
    """Run patch-wise inference on one NYUv2 scene/frame and build a montage.

    Reads the original image/depth pair, tiles it into overlapping patches,
    runs both the raw generator output ``g`` and the final prediction
    ``y_hat`` on every patch, reconstructs full-size maps, optionally writes
    the intermediate images to ``<args.dir>/images/``, and prints the RMSE of
    the reconstructed depth against ground truth.

    Args:
        scene: NYUv2 scene directory name (e.g. 'kitchen_0025').
        frame: Frame directory name within the scene.
        g: Generator-output tensor to evaluate per patch.
        y_hat: Final prediction tensor to evaluate per patch.
        args: Parsed arguments; ``args.dir`` is the output workspace.
        x_stride: Horizontal stride between patches, in pixels.
        y_stride: Vertical stride between patches, in pixels.
        save_images: If True, write all intermediate images to disk.

    Returns:
        The montage image (original, depth, prediction, variance side by side).
    """
    path = '/mnt/research/datasets/nyuv2/preprocessed/' + scene + '/' + frame
    # read in originals
    i, d = read_originals(path)
    name = scene + "_" + frame
    original_image, original_depth = write_to_disk(i, d, name, args,
                                                   save_images)

    # build up the patch batches to feed in
    hem.message('building patches...')
    image_batch = build_batch(i, x_stride=x_stride, y_stride=y_stride)
    depth_batch = build_batch(d, x_stride=x_stride, y_stride=y_stride,
                              channels=1)

    hem.message('generating results...')
    g_results = forward_inference(g, image_batch, depth_batch)
    y_hat_results = forward_inference(y_hat, image_batch, depth_batch)
    reconstructed_image_g, reconstructed_depth_g = reconstruct(
        image_batch, g_results, x_stride=x_stride, y_stride=y_stride)
    reconstructed_image_y_hat, reconstructed_depth_y_hat = reconstruct(
        image_batch, y_hat_results, x_stride=x_stride, y_stride=y_stride)

    # reconstructed image (input scaled back to 0-255)
    reconstructed_image = reconstructed_image_g * 255.0
    if save_images:
        cv2.imwrite(
            os.path.join(args.dir, 'images',
                         name + '_reconstructed_image.png'),
            reconstructed_image)

    # variance map: min-max normalize the raw generator depth to 0-255.
    # (The original scaled by 10.0 and immediately divided it back out.)
    g_min = reconstructed_depth_g.min()
    g_max = reconstructed_depth_g.max()
    reconstructed_var = (reconstructed_depth_g - g_min) / (g_max - g_min) * 255.0
    reconstructed_var = reconstructed_var.astype(np.uint8)
    if save_images:
        cv2.imwrite(
            os.path.join(args.dir, 'images',
                         name + '_reconstructed_variance.png'),
            reconstructed_var)

    # depth map: assumes depth values are in meters, 0-10 range — TODO confirm
    reconstructed_depth = reconstructed_depth_y_hat / 10.0 * 255.0
    reconstructed_depth = reconstructed_depth.astype(np.uint8)
    reconstructed_depth = cv2.applyColorMap(reconstructed_depth,
                                            cv2.COLORMAP_JET)
    if save_images:
        cv2.imwrite(
            os.path.join(args.dir, 'images',
                         name + '_reconstructed_depth.png'),
            reconstructed_depth)

    # expand the single-channel variance map to 3 channels for the montage
    reconstructed_var = np.concatenate(
        (reconstructed_var, reconstructed_var, reconstructed_var), axis=2)
    montage = np.concatenate((original_image, original_depth,
                              reconstructed_depth, reconstructed_var), axis=1)
    if save_images:
        cv2.imwrite(os.path.join(args.dir, 'images', name + '_montage.png'),
                    montage)

    # calculate rmse against the un-colorized prediction
    print('\trmse for {}/{}:'.format(scene, frame),
          rmse(d, reconstructed_depth_y_hat))
    return montage
# (Removed a block of commented-out code that duplicated the tail of
# process_example above.)

if __name__ == '__main__':
    hem.message('Parsing arguments...')
    args = hem.parse_args()

    hem.message('Loading metafile and graph data...')
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # Fixed-shape NCHW placeholders (batch 512, 65x65 patches) — must match
    # the shapes the checkpointed graph was built with; TODO confirm.
    x_ph = tf.placeholder(tf.float32, (512, 3, 65, 65))
    y_ph = tf.placeholder(tf.float32, (512, 1, 65, 65))

    # load graph, but replace input tensors with placeholders for feeding
    checkpoint_num = 50
    saver = tf.train.import_meta_graph(
        os.path.join(args.dir, 'checkpoint-{}.meta'.format(checkpoint_num)),
        input_map={
            "tower_0/input_preprocess/tower_0_x:0": x_ph,
            "tower_0/input_preprocess/tower_0_y:0": y_ph
        })