Example #1
def mesh_to_example(codebase_root_dir, mesh_path, dirpath, skip_existing,
                    log_level):
    # Logging level must be specified because mesh_to_example is an entry point
    # for a subprocess call.
    log.set_level(log_level)
    ldif_path = path_util.get_path_to_ldif_root()
    if not skip_existing or not os.path.isfile(
            f'{dirpath}/depth_and_normals.npz'):
        sp.check_output(
            f'{codebase_root_dir}/scripts/process_mesh_local.sh {mesh_path} {dirpath} {ldif_path}',
            shell=True)
        write_depth_and_normals_npz(dirpath,
                                    f'{dirpath}/depth_and_normals.npz')
    else:
        log.verbose(f'Skipping shell script processing for {dirpath},'
                    ' the output already exists.')
    # Precompute the dodeca samples for later:
    e = example.InferenceExample.from_directory(dirpath)
    sample_path = e.precomputed_surface_samples_from_dodeca_path
    if not skip_existing or not os.path.isfile(sample_path):
        e.surface_sample_count = 100000
        precomputed_samples = e.surface_samples_from_dodeca
        assert precomputed_samples.shape[0] == 100000
        assert precomputed_samples.shape[1] == 6
        file_util.write_points(sample_path, precomputed_samples)
    else:
        log.verbose(
            f'Skipping surface sample precomputation for {dirpath}, it\'s already done.'
        )
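
A minimal usage sketch for mesh_to_example, dispatching it over a few meshes with joblib, much like the driver in Example #4 does through process_one. The paths, and the assumption that mesh_to_example is already importable, are hypothetical.

from joblib import Parallel, delayed

# Hypothetical (mesh, output directory) pairs; the real driver derives these
# from a [mesh_directory]/[split]/[class]/*.ply glob.
meshes = [
    ('/data/meshes/train/chair/a.ply', '/data/ldif-dataset/train/chair/a'),
    ('/data/meshes/train/chair/b.ply', '/data/ldif-dataset/train/chair/b'),
]

Parallel(n_jobs=2)(
    delayed(mesh_to_example)(
        '/home/user/ldif',  # codebase_root_dir: assumed repo checkout location
        mesh_path,
        out_dir,
        True,               # skip_existing
        'info')             # log_level, re-applied inside each worker process
    for mesh_path, out_dir in meshes)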
Example #2
def mesh_to_example(codebase_root_dir, mesh_path, dirpath, skip_existing,
                    log_level):
    # Logging level must be specified because mesh_to_example is an entry point
    # for a subprocess call.
    log.set_level(log_level)
    ldif_path = path_util.get_path_to_ldif_root()
    if not skip_existing or not os.path.isfile(
            f'{dirpath}/depth_and_normals.npz'):
        sp.check_output(
            f'{codebase_root_dir}/scripts/process_mesh_local.sh {mesh_path} {dirpath} {ldif_path}',
            shell=True)
        # write_depth_and_normals_npz(dirpath, f'{dirpath}/depth_and_normals.npz')
    else:
        log.verbose(f'Skipping shell script processing for {dirpath},'
                    ' the output already exists.')
Example #3
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    tf.disable_v2_behavior()
    log.set_level(FLAGS.log_level)

    log.info('Making dataset...')
    if not FLAGS.dataset_directory:
        raise ValueError('A dataset directory must be provided.')
    # TODO(kgenova) This batch size should match.
    dataset = local_inputs.make_dataset(FLAGS.dataset_directory,
                                        mode='train',
                                        batch_size=FLAGS.batch_size,
                                        split=FLAGS.split)

    # Sets up the hyperparameters and tf.Dataset
    model_config = build_model_config(dataset)

    # Generates the graph for a single train step, including summaries
    shared_launcher.sif_transcoder(model_config)
    summary_op = tf.summary.merge_all()
    global_step_op = tf.compat.v1.train.get_global_step()

    saver = tf.train.Saver(max_to_keep=5,
                           pad_step_number=False,
                           save_relative_paths=True)

    init_op = tf.global_variables_initializer()

    model_root = get_model_root()

    experiment_dir = f'{model_root}/sif-transcoder-{FLAGS.experiment_name}'
    checkpoint_dir = f'{experiment_dir}/1-hparams/train/'

    if FLAGS.reserve_memory_for_inference_kernel and sys.platform != "darwin":
        current_free = gpu_util.get_free_gpu_memory(0)
        allowable = current_free - (1024 + 512)  # Reserve ~1.5 GB for the inference kernel.
        allowable_fraction = allowable / current_free
        if allowable_fraction <= 0.0:
            raise ValueError(
                f"Can't leave 1GB over for the inference kernel, because"
                f" there is only {allowable} total free GPU memory.")
        log.info(
            f'TensorFlow can use up to {allowable_fraction * 100:.1f}% of the'
            ' total GPU memory.')
    else:
        allowable_fraction = 1.0
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=allowable_fraction)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
        writer = tf.summary.FileWriter(f'{experiment_dir}/log', session.graph)
        log.info('Initializing variables...')
        session.run([init_op])

        if FLAGS.visualize:
            visualize_data(session, model_config.inputs['dataset'])

        # Check whether the checkpoint directory already exists (resuming) or
        # needs to be created (new model).
        if not os.path.isdir(checkpoint_dir):
            log.info('No previous checkpoint detected, training from scratch.')
            os.makedirs(checkpoint_dir)
            # Serialize hparams so eval can load them:
            hparam_path = f'{checkpoint_dir}/hparam_pickle.txt'
            if not file_util.exists(hparam_path):
                hparams.write_hparams(model_config.hparams, hparam_path)
            initial_index = 0
        else:
            log.info(
                f'Checkpoint root {checkpoint_dir} exists, attempting to resume.'
            )
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
            log.info(f'Latest checkpoint: {latest_checkpoint}')
            saver.restore(session, latest_checkpoint)
            initial_index = session.run(global_step_op)
            log.info(f'The global step is {initial_index}')
            initial_index = int(initial_index)
            log.info(f'Parsed to {initial_index}')
        for i in range(initial_index, FLAGS.train_step_count):
            start_time = time.time()
            log.info(f'Step {i}')
            is_summary_step = i % FLAGS.summary_step_interval == 0
            if is_summary_step:
                _, summaries, loss = session.run(
                    [model_config.train_op, summary_op, model_config.loss])
                writer.add_summary(summaries, i)
            else:
                _, loss = session.run(
                    [model_config.train_op, model_config.loss])
            end_time = time.time()
            steps_per_second = 1.0 / (end_time - start_time)
            log.info(f'Loss: {loss}\tSteps/second: {steps_per_second}')

            is_checkpoint_step = i % FLAGS.checkpoint_interval == 0
            if is_checkpoint_step or i == FLAGS.train_step_count - 1:
                ckpt_path = os.path.join(checkpoint_dir, 'model.ckpt')
                log.info(f'Writing checkpoint to {ckpt_path}...')
                saver.save(session, ckpt_path, global_step=i)
        log.info('Done training!')
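
The GPU-memory reservation above reduces to a small arithmetic rule. A standalone sketch of that rule, with the 1536 MB reserve carried over from the example and the free-memory figure made up:

def tf_memory_fraction(free_mb, reserve_mb=1536):
    # Fraction of the currently free GPU memory TensorFlow may claim,
    # keeping reserve_mb aside for the inference kernel.
    allowable_mb = free_mb - reserve_mb
    if allowable_mb <= 0:
        raise ValueError(
            f'Cannot reserve {reserve_mb} MB: only {free_mb} MB are free.')
    return allowable_mb / free_mb

# With 11000 MB free, TensorFlow would be allowed roughly 86% of the card:
# tf.GPUOptions(per_process_gpu_memory_fraction=tf_memory_fraction(11000))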
Example #4
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    random.seed(2077)
    log.set_level(FLAGS.log_level)

    n_jobs = os.cpu_count()
    assert FLAGS.max_threads != 0
    if FLAGS.max_threads > 0:
        n_jobs = FLAGS.max_threads

    mesh_directory = FLAGS.mesh_directory
    if mesh_directory[-1] == '/':
        mesh_directory = mesh_directory[:-1]

    files = glob.glob(f'{mesh_directory}/*/*/*.ply')

    if not files and not FLAGS.optimize_only:
        raise ValueError(f"Didn't find any ply files in {mesh_directory}. "
                         "Please make sure the directory structure is "
                         "[mesh_directory]/[splits]/[class names]/[ply files]")

    # Make the output directories up front: creating them inside the parallel workers isn't threadsafe and might fail.
    if files and not FLAGS.optimize_only:
        log.info('Creating directories...')
        for i, f in tqdm.tqdm(enumerate(files)):
            relpath = f.replace(mesh_directory, '')
            # log.info(f'Relpath: {relpath}')
            assert relpath[0] == '/'
            relpath = relpath[1:]
            split, synset = relpath.split('/')[:2]
            if not os.path.isdir(f'{FLAGS.dataset_directory}/{split}'):
                os.makedirs(f'{FLAGS.dataset_directory}/{split}')
            if not os.path.isdir(
                    f'{FLAGS.dataset_directory}/{split}/{synset}'):
                os.mkdir(f'{FLAGS.dataset_directory}/{split}/{synset}')
        log.info('Making dataset...')
        # Flags can't be pickled:
        output_dirs = Parallel(n_jobs=n_jobs)(
            delayed(process_one)(f, mesh_directory, FLAGS.dataset_directory,
                                 FLAGS.skip_existing, FLAGS.log_level)
            for f in tqdm.tqdm(files))
        log.info('Making dataset registry...')
    else:
        output_dirs = glob.glob(
            f'{FLAGS.dataset_directory}/*/*/*/surface_samples_from_dodeca.pts')
        output_dirs = [os.path.dirname(f) + '/' for f in output_dirs]
    # Sort so that shuffling with a fixed seed always produces the same order.
    output_dirs.sort()
    splits = {x.split('/')[-4] for x in output_dirs}
    if 'optimized' in splits:
        raise ValueError(
            f'The keyword "optimized" cannot be used for a split name, it is reserved.'
        )
    for split in splits:
        elements_of_split = [
            x for x in output_dirs if x.split('/')[-4] == split
        ]
        with open(f'{FLAGS.dataset_directory}/{split}.txt', 'wt') as f:
            f.write('\n'.join(elements_of_split) + '\n')
    log.info('Done!')

    if FLAGS.optimize:
        log.info('Precomputing optimized tfrecord files...')
        opt_dir = f'{FLAGS.dataset_directory}/optimized'
        if FLAGS.trample_optimized and os.path.isdir(opt_dir):
            for f in os.listdir(opt_dir):
                if f.endswith('.tfrecords'):
                    os.remove(os.path.join(opt_dir, f))
        if not os.path.isdir(opt_dir):
            os.mkdir(opt_dir)
        for split in splits:
            log.info(f'Optimizing split {split}...')
            elements_of_split = [
                x for x in output_dirs if x.split('/')[-4] == split
            ]
            examples_per_shard = 64
            # Make sure shards are totally random:
            random.shuffle(elements_of_split)
            n_shards = int(len(elements_of_split) / examples_per_shard)
            if len(elements_of_split) % examples_per_shard:
                n_shards += 1
            shard_dir = f'{FLAGS.dataset_directory}/optimized/{split}'
            if not os.path.isdir(shard_dir):
                os.mkdir(shard_dir)
            for shard_idx in tqdm.tqdm(range(n_shards)):
                shard_name = (f'{shard_dir}/{split}-{shard_idx:05d}'
                              f'-of-{n_shards:05d}.tfrecords')
                if not FLAGS.trample_optimized and os.path.isfile(shard_name):
                    continue
                start_idx = shard_idx * examples_per_shard
                end_idx = (shard_idx + 1) * examples_per_shard
                options = tf.io.TFRecordOptions(
                    tf.compat.v1.io.TFRecordCompressionType.GZIP)
                with tf.io.TFRecordWriter(shard_name,
                                          options=options) as writer:
                    to_process = elements_of_split[start_idx:end_idx]
                    serialized = Parallel(n_jobs=n_jobs)(
                        delayed(serialize)(d, FLAGS.log_level)
                        for d in to_process)
                    for s in serialized:
                        writer.write(s)
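
For reference, the GZIP-compressed shards written above can be read back with tf.data. This is a TF2 eager-mode sketch, and the dataset path is hypothetical:

import glob
import tensorflow as tf

# Mirrors the {dataset_directory}/optimized/{split}/ layout created above.
shard_files = sorted(glob.glob('/data/ldif-dataset/optimized/train/*.tfrecords'))

# compression_type must match the GZIP option used by the writer.
dataset = tf.data.TFRecordDataset(shard_files, compression_type='GZIP')

for raw_record in dataset.take(2):
    print(f'serialized example of {len(raw_record.numpy())} bytes')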
Example #5
def load_example_dict(example_directory, log_level=None):
    """Loads an example from disk and makes a str:numpy dictionary out of it."""
    if log_level:
        log.set_level(log_level)
    entry_t = time.time()
    start_t = entry_t  # Keep the function entry time around for a cumulative print.
    e = example.InferenceExample.from_directory(example_directory,
                                                verbose=False)
    end_t = time.time()
    log.verbose(f'Make example: {end_t - start_t}')
    start_t = end_t

    # The from_directory method should probably optionally take in a synset.
    bounding_box_samples = e.uniform_samples
    end_t = time.time()
    log.verbose(f'Bounding box: {end_t - start_t}')
    start_t = end_t
    # TODO(kgenova) There is a pitfall here where the depth is divided by 1000,
    # after this. So if some other depth images are provided, they would either
    # need to also be stored in the GAPS format or be artificially multiplied
    # by 1000.
    depth_renders = e.depth_images  # [20, 224, 224, 1]. 1 or 1000? trailing 1?
    assert depth_renders.shape[0] == 1
    depth_renders = depth_renders[0, ...]
    end_t = time.time()
    log.verbose(f'Depth renders: {end_t - start_t}')
    start_t = end_t

    mesh_name = e.mesh_name
    end_t = time.time()
    log.verbose(f'Mesh name: {end_t - start_t}')
    start_t = end_t

    log.verbose(f'Loading {mesh_name} from split {e.split}')
    near_surface_samples = e.near_surface_samples
    end_t = time.time()
    log.verbose(f'NSS: {end_t - start_t}')

    start_t = end_t
    grid = e.grid
    end_t = time.time()
    log.verbose(f'Grid: {end_t - start_t}')
    start_t = end_t

    world2grid = e.world2grid
    end_t = time.time()
    log.verbose(f'world2grid: {end_t - start_t}')
    start_t = end_t

    surface_point_samples = e.precomputed_surface_samples_from_dodeca
    end_t = time.time()
    log.verbose(f'surface points: {end_t - start_t}')
    log.verbose(f'load_example_dict total time: {end_t - entry_t}')
    return {
        'bounding_box_samples': bounding_box_samples,
        'depth_renders': depth_renders,
        'mesh_name': mesh_name,
        'near_surface_samples': near_surface_samples,
        'grid': grid,
        'world2grid': world2grid,
        'surface_point_samples': surface_point_samples,
    }
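
A small sketch of inspecting the dictionary returned by load_example_dict; the example directory is hypothetical:

example_np = load_example_dict('/data/ldif-dataset/train/chairs/model_0')

for key, value in example_np.items():
    if hasattr(value, 'shape'):
        print(f'{key}: shape={value.shape} dtype={value.dtype}')
    else:
        print(f'{key}: {value}')  # mesh_name is a plain string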
Example #6
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    log.set_level(FLAGS.log_level)
    tf.disable_v2_behavior()

    gpu_util.get_free_gpu_memory(0)
    if FLAGS.use_gpu_for_tensorflow and FLAGS.use_inference_kernel:
        log.info('Limiting TensorFlow memory by 1GB so the inference kernel'
                 ' has enough left over to run.')

    if not FLAGS.dataset_directory:
        raise ValueError('A dataset directory must be provided.')
    if not FLAGS.result_directory:
        if FLAGS.save_results or FLAGS.save_meshes or FLAGS.save_ldifs:
            raise ValueError(
                'A result directory must be provided to save results.')
    else:
        if not os.path.isdir(FLAGS.result_directory):
            os.makedirs(FLAGS.result_directory)
    if not FLAGS.use_gpu_for_tensorflow:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    log.info('Loading model...')
    # Try to detect the most common error early for a good warning message:
    if not os.path.isdir(get_model_root()):
        raise ValueError(
            f"Couldn't find a trained model at {get_model_root()}")
    encoder, decoder = load_newest_model()

    log.info('Evaluating metrics...')
    splits = [x for x in FLAGS.split.split(',') if x]
    log.info(f'Will evaluate on splits: {splits}')
    for split in splits:
        log.info(f'Starting evaluation for split {split}.')
        dataset_items = get_evaluation_directories(split)
        log.info(f'The split has {len(dataset_items)} elements.')
        results = []
        to_eval = filter_by_class(dataset_items)
        to_eval = filter_by_eval_frac(to_eval)
        for path in tqdm.tqdm(to_eval):
            e = examples.InferenceExample.from_directory(path)
            embedding = encoder.run_example(e)
            iou = decoder.iou(embedding, e)
            gt_mesh = e.gt_mesh
            mesh = decoder.extract_mesh(embedding, resolution=FLAGS.resolution)
            if FLAGS.visualize:
                # Visualize in the normalized_coordinate frame, so the camera is
                # always reasonable. Metrics are computed in the original frame.
                gaps_util.mshview([e.normalized_gt_mesh, mesh])

            # TODO(kgenova) gaps2occnet is poorly named, it is really normalized ->
            # unnormalized (where 'gaps' is the normalized training frame and 'occnet'
            # is whatever the original frame of the input mesh was)
            post_extract_start = time.time()
            mesh.apply_transform(e.gaps2occnet)

            if FLAGS.save_meshes:
                path = (f'{FLAGS.result_directory}/meshes/{split}/{e.cat}/'
                        f'{e.mesh_hash}.ply')
                if not os.path.isdir(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
                mesh.export(path)
            if FLAGS.save_ldifs:
                path = (f'{FLAGS.result_directory}/ldifs/{split}/{e.cat}/'
                        f'{e.mesh_hash}.txt')
                if not os.path.isdir(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
                decoder.savetxt(embedding, path)

            nc, fst, fs2t, chamfer = metrics.all_mesh_metrics(mesh, gt_mesh)
            log.verbose(f'Mesh: {e.mesh_name}')
            log.verbose(f'IoU: {iou}.')
            log.verbose(f'F-Score (tau): {fst}')
            log.verbose(f'Chamfer: {chamfer}')
            log.verbose(f'F-Score (2*tau): {fs2t}')
            log.verbose(f'Normal Consistency: {nc}')
            results.append({
                'key': e.mesh_name,
                'Normal Consistency': nc,
                'F-Score (tau)': fst,
                'F-Score (2*tau)': fs2t,
                'Chamfer': chamfer,
                'IoU': iou
            })
            post_extract_end = time.time()
            log.verbose(
                f'Time post extract: {post_extract_end - post_extract_start}')
        results = pd.DataFrame(results)
        if FLAGS.save_results:
            complete_csv = results.to_csv()
            result_path = f'{FLAGS.result_directory}/full_results_{split}.csv'
            file_util.writetxt(result_path, complete_csv)
        final_results = metrics.aggregate_extracted(results)
        if FLAGS.save_results:
            summary_out_path = f'{FLAGS.result_directory}/result_summary_{split}.csv'
            file_util.writetxt(summary_out_path, final_results.to_csv())
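
The per-split results DataFrame above has one row per mesh. The real aggregation is done by metrics.aggregate_extracted; purely as an illustration (not that library's implementation), a split summary could be as simple as a column-wise mean:

import pandas as pd

# Placeholder rows with the same shape as the 'results' records built above.
results = pd.DataFrame([
    {'key': 'chair_0', 'IoU': 0.82, 'Chamfer': 0.031, 'F-Score (tau)': 91.2},
    {'key': 'chair_1', 'IoU': 0.76, 'Chamfer': 0.044, 'F-Score (tau)': 87.5},
])

# Column-wise mean over every numeric metric in the split.
summary = results.drop(columns=['key']).mean().to_frame(name='mean').T
print(summary)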
Example #7
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    tf.disable_v2_behavior()
    log.set_level(FLAGS.log_level)

    log.info('Making dataset...')
    if not FLAGS.dataset_directory:
        raise ValueError('A dataset directory must be provided.')
    if not os.path.isdir(FLAGS.dataset_directory):
        raise ValueError(
            f'No dataset directory found at {FLAGS.dataset_directory}')
    # TODO(kgenova) This batch size should match.
    dataset = local_inputs.make_dataset(FLAGS.dataset_directory,
                                        mode='train',
                                        batch_size=FLAGS.batch_size,
                                        split=FLAGS.split)

    # Sets up the hyperparameters and tf.Dataset
    model_config = build_model_config(dataset)
    #print('[HERE: In train] ******* Printing model_config, right after building model config')
    #print(type(model_config))
    #print(dir(model_config))
    #print('[HERE: In train] ******* Printing model_config done, right after building model config')

    # Generates the graph for a single train step, including summaries

    # shared_launcher.sif_transcoder sets more configs of model_config
    shared_launcher.sif_transcoder(model_config)
    print(
        '[HERE: In train] ******* Printing model_config, right after running shared_launcher'
    )
    print(type(model_config))
    print(dir(model_config))

    print('Type of model_config.train_op:', type(model_config.train_op))
    print('Type of model_config.loss:', type(model_config.loss))
    print('Losses used:', model_config.hparams.loss)
    print('Hparams:', model_config.hparams)
    # train_op is a tensor!
    print(
        '[HERE: In train] ******* Printing model_config done, right after running shared_launcher'
    )

    summary_op = tf.summary.merge_all()
    global_step_op = tf.compat.v1.train.get_global_step()

    saver = tf.train.Saver(max_to_keep=5,
                           pad_step_number=False,
                           save_relative_paths=True)

    init_op = tf.global_variables_initializer()

    model_root = get_model_root()

    experiment_dir = f'{model_root}/sif-transcoder-{FLAGS.experiment_name}'
    checkpoint_dir = f'{experiment_dir}/1-hparams/train/'

    if FLAGS.reserve_memory_for_inference_kernel and sys.platform != "darwin":
        print(
            '[HERE: In train] --reserve_memory_for_inference_kernel specified.'
        )

        current_free = gpu_util.get_free_gpu_memory(2)
        allowable = current_free - (1024 + 512)  # Reserve ~1.5 GB for the inference kernel.
        allowable = min(allowable, 10000)
        allowable_fraction = allowable / current_free

        print('[HERE: In train] GPU memory usage planning:')
        #print('[HERE: In train] | allowable is limited to = 5000')
        print('[HERE: In train] | current_free = %d, allowable = %d' %
              (current_free, allowable))

        if allowable_fraction <= 0.0:
            raise ValueError(
                f"Can't leave 1GB over for the inference kernel, because"
                f" there is only {allowable} total free GPU memory.")
        log.info(
            f'TensorFlow can use up to {allowable_fraction * 100:.1f}% of the'
            ' total GPU memory.')
    else:
        allowable_fraction = 1.0
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=allowable_fraction)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
        #print('[HERE: In train] ******* Printing model_config, right after session creation')
        #print(type(model_config))
        #print(dir(model_config))
        #print('[HERE: In train] ******* Printing model_config done, right after session creation')

        writer = tf.summary.FileWriter(f'{experiment_dir}/log', session.graph)
        log.info('Initializing variables...')
        session.run([init_op])

        #print('[HERE: In train] ******* Printing model_config, right after session init')
        #print(type(model_config))
        #print(dir(model_config))
        #print('[HERE: In train] ******* Printing model_config done, right after session init')

        if FLAGS.visualize:
            visualize_data(session, model_config.inputs['dataset'])

        # Check whether the checkpoint directory already exists (resuming) or
        # needs to be created (new model).
        if not os.path.isdir(checkpoint_dir):
            log.info('No previous checkpoint detected, training from scratch.')
            os.makedirs(checkpoint_dir)
            # Serialize hparams so eval can load them:
            hparam_path = f'{checkpoint_dir}/hparam_pickle.txt'
            if not file_util.exists(hparam_path):
                hparams.write_hparams(model_config.hparams, hparam_path)
            initial_index = 0
        else:
            log.info(
                f'Checkpoint root {checkpoint_dir} exists, attempting to resume.'
            )
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
            log.info(f'Latest checkpoint: {latest_checkpoint}')
            saver.restore(session, latest_checkpoint)
            initial_index = session.run(global_step_op)
            log.info(f'The global step is {initial_index}')
            initial_index = int(initial_index)
            log.info(f'Parsed to {initial_index}')

        print('[HERE: In train] Starting training...')
        start_time = time.time()
        log_every = 10

        print(
            '[HERE: In train] ******* Printing model_config, right before training loop starts'
        )
        print(type(model_config))
        print(dir(model_config))
        print(
            '[HERE: In train] ******* Printing model_config done, right before training loop starts'
        )

        for i in range(initial_index, FLAGS.train_step_count):
            print(
                '[HERE: In train] Starting training, within loop, before log verbose...'
            )
            log.verbose(f'Starting step {i}...')
            print(f'[HERE: In train] Starting step {i}...')
            print(
                '[HERE: In train] Starting training, within loop, after verbose...'
            )
            is_summary_step = i % FLAGS.summary_step_interval == 0

            # running the session to get the results
            if is_summary_step:
                #print('[HERE: In train] This is a summary step. Computing summaries and loss...')
                _, summaries, loss = session.run(
                    [model_config.train_op, summary_op, model_config.loss])
                writer.add_summary(summaries, i)
                print(
                    '[HERE: In train] This is a summary step. Done writing summaries and loss...'
                )
            else:
                print(
                    '[HERE: In train] This is not a summary step. Computing loss...'
                )
                _, loss = session.run(
                    [model_config.train_op, model_config.loss])
                print(
                    '[HERE: In train] This is not a summary step. Done computing loss...'
                )
            if not (i % log_every):
                print('[HERE: In train] This is a log step. Logging...')
                end_time = time.time()
                steps_per_second = float(log_every) / (end_time - start_time)
                start_time = end_time
                log.info(
                    f'Step: {i}\tLoss: {loss}\tSteps/second: {steps_per_second}'
                )
                print('[HERE: In train] This is a log step. Logging done...')

            is_checkpoint_step = i % FLAGS.checkpoint_interval == 0
            if is_checkpoint_step or i == FLAGS.train_step_count - 1:
                print(
                    '[HERE: In train] This is a saving checkpoint step. Saving model...'
                )
                ckpt_path = os.path.join(checkpoint_dir, 'model.ckpt')
                log.info(f'Writing checkpoint to {ckpt_path}...')
                saver.save(session, ckpt_path, global_step=i)
                print(
                    '[HERE: In train] This is a saving checkpoint step. Done saving model...'
                )

            print('[HERE: In train] This step done. Starting a new step...')
        log.info('Done training!')