import glob
import os
import pickle
import random
import shutil
import time
from os import path

import h5py
import numpy as np
import tensorflow as tf
from PIL import Image

# `printing` is a local helper module; _JOB_SUBDIR_PREFIX, _TF_RECORD_EXT,
# _records_file_name, _number_of_examples_in_record and _big_enough are
# defined elsewhere in this module.


def join_created_images_records(out_dir, num_jobs):
    jobs_dirs_glob = path.join(out_dir, '{}*'.format(_JOB_SUBDIR_PREFIX))
    jobs_dirs = glob.glob(jobs_dirs_glob)
    assert len(jobs_dirs) == num_jobs, 'Expected {} subdirs, got {}: {}'.format(
        num_jobs, len(jobs_dirs), jobs_dirs)
    records = glob.glob(
        path.join(jobs_dirs_glob, '*.{}'.format(_TF_RECORD_EXT)))
    assert len(records) > 0, 'Did not find any records in {}/{}*'.format(
        out_dir, _JOB_SUBDIR_PREFIX)
    # Get SHARD from out_dir/job_x/SHARD_xxx.ext
    base_records_file_name = path.basename(records[0]).split('_')[0]
    for shard_number, records_p in enumerate(
            printing.ProgressPrinter('Moving records...', iter_list=records)):
        target_p = path.join(
            out_dir, _records_file_name(base_records_file_name, shard_number))
        os.rename(records_p, target_p)
    print('Removing empty job dirs...')
    list(map(os.removedirs, jobs_dirs))  # Remove all job dirs, now empty.
    print('Counting...')
    all_records_glob = path.join(out_dir, '*.{}'.format(_TF_RECORD_EXT))
    printing.print_join(
        '{}: {}'.format(path.basename(p), _number_of_examples_in_record(p))
        for p in sorted(glob.glob(all_records_glob)))
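
# Usage sketch (path and job count hypothetical): after e.g. 4 parallel jobs
# have each written shards into out_dir/<_JOB_SUBDIR_PREFIX>{0..3}, merge all
# shards into out_dir itself and renumber them consecutively:
#
#   join_created_images_records('records_out', num_jobs=4)
#
# The per-shard counts printed above come from the module's private
# _number_of_examples_in_record helper. A minimal sketch of such a count,
# assuming the TF1 tf.python_io API used elsewhere in this file (the real
# helper may differ):
def _count_examples_sketch(record_p):
    return sum(1 for _ in tf.python_io.tf_record_iterator(record_p))
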
def create_records_with_feature_dicts(feature_dicts, out_dir, num_per_shard,
                                      max_shards=None, file_name='shard'):
    """
    :param feature_dicts: iterator yielding dictionaries with tf.train.Feature
        as values, to encode as features
    :param out_dir: directory to write the record shards to (created if needed)
    :param num_per_shard: number of examples to store per shard
    :param max_shards: if given, stop after creating this many shards
    :param file_name: base name of the shard files
    :return: None
    """
    os.makedirs(out_dir, exist_ok=True)
    writer = None
    with printing.ProgressPrinter() as progress_printer:
        for count, feature in enumerate(feature_dicts):
            if count % num_per_shard == 0:  # Start a new shard.
                progress_printer.finish_line()
                if writer:
                    writer.close()
                shard_number = count // num_per_shard
                if max_shards is not None and shard_number == max_shards:
                    print('Created {} shards...'.format(max_shards))
                    return
                record_p = path.join(
                    out_dir, _records_file_name(file_name, shard_number))
                assert not path.exists(
                    record_p), 'Record already exists! {}'.format(record_p)
                print('Creating {}...'.format(record_p))
                writer = tf.python_io.TFRecordWriter(record_p)
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))
            writer.write(example.SerializeToString())
            progress_printer.update((count % num_per_shard) / num_per_shard)
    if writer:
        writer.close()
    else:
        print('Nothing written...')
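
# Minimal usage sketch (not part of the pipeline): writes 10 tiny examples into
# 2 shards of 5 each. The feature key 'image' and the output path are
# hypothetical; tf.train.Feature/BytesList are the standard TF1 proto APIs.
def _example_create_records(out_dir='/tmp/records_demo'):
    def feature_dicts():
        for i in range(10):
            raw = bytes([i] * 48)  # Stand-in for encoded image bytes.
            yield {'image': tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[raw]))}
    create_records_with_feature_dicts(feature_dicts(), out_dir, num_per_shard=5)
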
def make_hdf5_files(out_dir, images_glob, shuffle=True, num_per_shard=1000,
                    max_shards=None, min_size=None,
                    name_fmt='shard_{:010d}.hdf5', force=False):
    """
    Notes:
        - total output file size may be much bigger than the input, as JPGs
          get decompressed and stored as uint8

    :param out_dir: directory to write the HDF5 shards to
    :param images_glob: glob matching the input images
    :param shuffle: if True, shuffle the image paths before sharding
    :param num_per_shard: number of images to store per shard
    :param max_shards: if given, stop after creating this many shards
    :param min_size: if given, only keep images passing _big_enough(p, min_size)
    :param name_fmt: format string used to name the shard files
    :param force: if True, delete an existing out_dir instead of raising
    :return: None
    """
    if os.path.isdir(out_dir):
        if not force:
            raise ValueError('{} already exists.'.format(out_dir))
        print('Removing {}...'.format(out_dir))
        time.sleep(1)
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)
    with open(os.path.join(out_dir, 'log'), 'w') as f:
        info_str = '\n'.join(
            '{}={}'.format(k, v) for k, v in
            [('out_dir', out_dir),
             ('images_glob', images_glob),
             ('shuffle', shuffle),
             ('num_per_shard', num_per_shard),
             ('max_shards', max_shards),
             ('min_size', min_size),
             ('name_fmt', name_fmt),
             ('force', force)])
        print(info_str)
        f.write(info_str + '\n')

    print('Getting images...')
    image_ps = sorted(glob.glob(images_glob))
    assert len(image_ps) > 0, 'No matches for {}'.format(images_glob)
    print('Found {} images'.format(len(image_ps)))

    if shuffle:
        print('Shuffling...')
        random.shuffle(image_ps)

    num_shards_total = str(len(image_ps) // num_per_shard)
    if min_size:
        print('Filtering for >= {}...'.format(min_size))
        image_ps = (p for p in image_ps if _big_enough(p, min_size))
        num_shards_total = '<=' + num_shards_total  # Filtering may drop images.

    writer = None
    count = 0
    shard_ps = []
    with printing.ProgressPrinter() as progress_printer:
        for count, image_p in enumerate(image_ps):
            if count % num_per_shard == 0:  # Start a new shard.
                progress_printer.finish_line()
                if writer:
                    writer.close()
                shard_number = count // num_per_shard
                if max_shards is not None and shard_number == max_shards:
                    print('Created {} shards...'.format(max_shards))
                    return
                shard_p = os.path.join(out_dir, name_fmt.format(shard_number))
                assert not os.path.exists(
                    shard_p), 'Shard already exists! {}'.format(shard_p)
                print('Creating {} [{}/{}]...'.format(
                    shard_p, shard_number, num_shards_total))
                writer = h5py.File(shard_p, 'w')
                shard_ps.append(shard_p)
            image = Image.open(image_p).convert('RGB')
            image = np.array(image, np.uint8).transpose((2, 0, 1))  # HWC -> CHW
            assert image.shape[0] == 3
            # Key expected by HDF5DataLoader, TODO: document
            index = str(count % num_per_shard)
            writer.create_dataset(index, data=image)
            progress_printer.update((count % num_per_shard) / num_per_shard)
    if writer:
        writer.close()
        assert len(shard_ps)
        # Write the number of images per shard to num_per_shard.pkl; the last
        # shard may contain fewer than num_per_shard images.
        p_to_num_per_shard = {
            os.path.basename(shard_p): num_per_shard for shard_p in shard_ps}
        last_shard_p = shard_ps[-1]
        with h5py.File(last_shard_p, 'r') as f:
            p_to_num_per_shard[os.path.basename(last_shard_p)] = len(f.keys())
        with open(os.path.join(out_dir, 'num_per_shard.pkl'), 'wb') as f:
            pickle.dump(p_to_num_per_shard, f)
    else:
        print('Nothing written, processed {} files...'.format(count))
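
# Usage sketch (paths hypothetical): shard a folder of JPGs into HDF5 files of
# 500 images each, skipping images that fail the _big_enough(p, 128) check:
#
#   make_hdf5_files('data/train_hdf5', 'data/train/*.jpg',
#                   num_per_shard=500, min_size=128, force=True)
#
# Read-back sketch: each shard maps string keys '0'..'N-1' to CHW uint8 arrays,
# which is the layout HDF5DataLoader expects (see `index` above):
def _read_first_image_sketch(shard_p):
    with h5py.File(shard_p, 'r') as f:
        return f['0'][()]  # np.ndarray of shape (3, H, W), dtype uint8
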