Code example #1
import glob
import os
from os import path

# printing and the underscore-prefixed helpers are project-internal (sketched below).


def join_created_images_records(out_dir, num_jobs):
    jobs_dirs_glob = path.join(out_dir, '{}*'.format(_JOB_SUBDIR_PREFIX))
    jobs_dirs = glob.glob(jobs_dirs_glob)
    assert len(jobs_dirs) == num_jobs, 'Expected {} subdirs, got {}'.format(
        num_jobs, len(jobs_dirs))

    records = glob.glob(
        path.join(jobs_dirs_glob, '*.{}'.format(_TF_RECORD_EXT)))
    assert len(records) > 0, 'Did not find any records in {}/{}_*'.format(
        out_dir, _JOB_SUBDIR_PREFIX)

    # Get SHARD from out_dir/job_x/SHARD_xxx.ext.
    base_records_file_name = path.basename(records[0]).split('_')[0]
    for shard_number, records_p in enumerate(
            printing.ProgressPrinter('Moving records...', iter_list=records)):
        target_p = path.join(
            out_dir, _records_file_name(base_records_file_name, shard_number))
        os.rename(records_p, target_p)

    print('Removing empty job dirs...')
    for jobs_dir in jobs_dirs:  # the job dirs are empty now that their records moved
        os.removedirs(jobs_dir)

    print('Counting...')
    all_records_glob = path.join(out_dir, '*.{}'.format(_TF_RECORD_EXT))
    printing.print_join(
        '{}: {}'.format(path.basename(p), _number_of_examples_in_record(p))
        for p in sorted(glob.glob(all_records_glob)))
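
The module-level names used above (printing, _JOB_SUBDIR_PREFIX, _TF_RECORD_EXT, _records_file_name, _number_of_examples_in_record) live in the surrounding file and are not shown. A minimal sketch of what the non-printing helpers might look like, assuming shards named SHARD_0000.tfrecord inside out_dir/job_x/ subdirectories (the naming is an assumption, not the project's confirmed scheme):

import tensorflow as tf

_JOB_SUBDIR_PREFIX = 'job_'  # assumption: per-job subdirs are out_dir/job_0, job_1, ...
_TF_RECORD_EXT = 'tfrecord'  # assumption: shard file extension


def _records_file_name(base_name, shard_number):
    # Assumption: shards follow the SHARD_xxx.ext pattern noted in the comment above.
    return '{}_{:04d}.{}'.format(base_name, shard_number, _TF_RECORD_EXT)


def _number_of_examples_in_record(record_p):
    # Count the serialized examples in one record file (TF1 iterator API).
    return sum(1 for _ in tf.python_io.tf_record_iterator(record_p))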
Code example #2
import os
from os import path

import tensorflow as tf  # this snippet uses the TF1 tf.python_io API


def create_records_with_feature_dicts(feature_dicts,
                                      out_dir,
                                      num_per_shard,
                                      max_shards=None,
                                      file_name='shard'):
    """
    :param feature_dicts: iterator yielding dictionaries with tf.train.Feature as values, to encode as features
    :param out_dir:
    :param num_per_shard:
    :param file_name:
    :return:
    """
    os.makedirs(out_dir, exist_ok=True)
    writer = None
    with printing.ProgressPrinter() as progress_printer:
        for count, feature in enumerate(feature_dicts):
            if count % num_per_shard == 0:
                progress_printer.finish_line()
                if writer:
                    writer.close()
                shard_number = count // num_per_shard
                if max_shards is not None and shard_number == max_shards:
                    print('Created {} shards...'.format(max_shards))
                    return
                record_p = path.join(
                    out_dir, _records_file_name(file_name, shard_number))
                assert not path.exists(
                    record_p), 'Record already exists! {}'.format(record_p)
                print('Creating {}...'.format(record_p))
                writer = tf.python_io.TFRecordWriter(record_p)  # tf.io.TFRecordWriter in TF2
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))
            writer.write(example.SerializeToString())
            progress_printer.update((count % num_per_shard) / num_per_shard)
    if writer:
        writer.close()
    else:
        print('Nothing written...')
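
A minimal usage sketch for the function above. The _bytes_feature/_int64_feature helpers, the images/*.jpg glob, and the records_out directory are illustrative; tf.train.BytesList and tf.train.Int64List are the standard TensorFlow feature wrappers:

import glob

import tensorflow as tf


def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def image_feature_dicts(image_ps):
    # Yield one dict of tf.train.Feature values per image, as feature_dicts expects.
    for i, image_p in enumerate(image_ps):
        with open(image_p, 'rb') as f:
            yield {'image_raw': _bytes_feature(f.read()),
                   'index': _int64_feature(i)}


create_records_with_feature_dicts(
    image_feature_dicts(sorted(glob.glob('images/*.jpg'))),  # illustrative path
    out_dir='records_out',
    num_per_shard=1000)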
Code example #3
File: maker.py  Project: voletiv/hdf5_dataloader
import glob
import os
import pickle
import random
import shutil
import time

import h5py
import numpy as np
from PIL import Image

# printing and _big_enough are project-internal helpers.


def make_hdf5_files(out_dir,
                    images_glob,
                    shuffle=True,
                    num_per_shard=1000,
                    max_shards=None,
                    min_size=None,
                    name_fmt='shard_{:010d}.hdf5',
                    force=False):
    """
    Notes:
        - total output file size may be much bigger, as JPGs get decompressed and stored as uint8
    :param out_dir:
    :param images_glob:
    :param shuffle:
    :param num_per_shard:
    :param max_shards:
    :param min_size:
    :param name_fmt:
    :param force:
    :return:
    """
    if os.path.isdir(out_dir):
        if not force:
            raise ValueError('{} already exists.'.format(out_dir))
        print('Removing {}...'.format(out_dir))
        time.sleep(1)
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

    with open(os.path.join(out_dir, 'log'), 'w') as f:
        info_str = '\n'.join('{}={}'.format(k, v) for k, v in [
            ('out_dir', out_dir),
            ('images_glob', images_glob),
            ('shuffle', shuffle),
            ('num_per_shard', num_per_shard),
            ('max_shards', max_shards),
            ('min_size', min_size),
            ('name_fmt', name_fmt),
            ('force', force)])
        print(info_str)
        f.write(info_str + '\n')

    print('Getting images...')
    image_ps = sorted(glob.glob(images_glob))
    assert len(image_ps) > 0, 'No matches for {}'.format(images_glob)
    print('Found {} images'.format(len(image_ps)))

    if shuffle:
        print('Shuffling...')
        random.shuffle(image_ps)
    # Ceiling division: a partial final shard still counts as one shard.
    num_shards_total = str((len(image_ps) + num_per_shard - 1) // num_per_shard)

    if min_size:
        print('Filtering for >= {}...'.format(min_size))
        image_ps = (p for p in image_ps if _big_enough(p, min_size))
        num_shards_total = '<=' + num_shards_total

    writer = None
    count = 0
    shard_ps = []
    with printing.ProgressPrinter() as progress_printer:
        for count, image_p in enumerate(image_ps):
            if count % num_per_shard == 0:
                progress_printer.finish_line()
                if writer:
                    writer.close()
                shard_number = count // num_per_shard
                if max_shards is not None and shard_number == max_shards:
                    print('Created {} shards...'.format(max_shards))
                    # NOTE: returning here skips writing num_per_shard.pkl below.
                    return
                shard_p = os.path.join(out_dir, name_fmt.format(shard_number))
                assert not os.path.exists(
                    shard_p), 'Shard already exists! {}'.format(shard_p)
                print('Creating {} [{}/{}]...'.format(shard_p, shard_number,
                                                      num_shards_total))
                writer = h5py.File(shard_p, 'w')
                shard_ps.append(shard_p)
            image = Image.open(image_p).convert('RGB')
            image = np.array(image, np.uint8).transpose((2, 0, 1))  # HWC -> CHW
            assert image.shape[0] == 3
            # Dataset key expected by HDF5DataLoader. TODO: document.
            index = str(count % num_per_shard)
            writer.create_dataset(index, data=image)
            progress_printer.update((count % num_per_shard) / num_per_shard)
    if writer:
        writer.close()
        assert len(shard_ps)
        # Map each shard's basename to its entry count for num_per_shard.pkl;
        # the last shard may hold fewer than num_per_shard images.
        p_to_num_per_shard = {
            os.path.basename(shard_p): num_per_shard
            for shard_p in shard_ps
        }
        last_shard_p = shard_ps[-1]
        with h5py.File(last_shard_p, 'r') as f:
            p_to_num_per_shard[os.path.basename(last_shard_p)] = len(f.keys())
        with open(os.path.join(out_dir, 'num_per_shard.pkl'), 'wb') as f:
            pickle.dump(p_to_num_per_shard, f)
    else:
        print('Nothing written, processed {} files...'.format(count))
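
A usage sketch plus a reader for the layout the function produces: each shard stores one uint8 CHW array per image under string keys '0'..'N-1', and num_per_shard.pkl maps shard basenames to their entry counts. The hdf5_out and images/*.jpg paths are illustrative:

import os
import pickle

import h5py

make_hdf5_files('hdf5_out', 'images/*.jpg', num_per_shard=1000)

with open(os.path.join('hdf5_out', 'num_per_shard.pkl'), 'rb') as f:
    num_per_shard_map = pickle.load(f)  # e.g. {'shard_0000000000.hdf5': 1000, ...}

first_shard = sorted(num_per_shard_map)[0]
with h5py.File(os.path.join('hdf5_out', first_shard), 'r') as shard:
    image = shard['0'][()]  # uint8 array of shape (3, H, W)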