def _process_dataset(name, filenames, labels, bbox_info, num_shards):
    """Convert a list of image files into sharded TFRecord files.

    :param name: unique dataset identifier string (e.g. 'train', 'validation').
    :param filenames: list of strings; image file paths.
    :param labels: list of integers; integer-encoded label per image.
    :param bbox_info: nested dict of label id to file id (bounding-box info),
        or None when bounding boxes are not used.
    :param num_shards: number of shards to split the dataset into.
    """
    assert len(filenames) == len(labels)

    shard_offsets = dataset_utils.make_shard_offsets(
        len(filenames), FLAGS.num_threads, num_shards)
    shard_output_filenames = dataset_utils.make_shard_filenames(
        name, len(filenames), FLAGS.num_threads, num_shards)

    def _run_thread(thread_index):
        # Each worker thread processes only its own slice of shards.
        _process_image_files_batch(thread_index,
                                   shard_offsets[thread_index],
                                   shard_output_filenames[thread_index],
                                   filenames, labels, bbox_info)

    dataset_utils.thread_execute(FLAGS.num_threads, _run_thread)
    dataset_utils.log('%s: Finished writing all %d images in data set.' %
                      (datetime.now(), len(filenames)))
def _process_image_files_batch(thread_index, offsets, output_filenames,
                               filenames, labels, bbox_info):
    """Convert one thread's slice of the image list into TFRecord files.

    :param thread_index: index of the worker thread running this batch.
    :param offsets: list of (start, end) offset pairs into `filenames`,
        one pair per shard assigned to this thread.
    :param output_filenames: list of output file names, one per shard.
    :param filenames: full list of image file paths.
    :param labels: full list of integer labels for the images.
    :param bbox_info: dict mapping file id to bounding-box info, or None
        when bounding boxes are not used.
    """
    assert len(offsets) == len(output_filenames)
    assert len(filenames) == len(labels)

    num_files_in_thread = offsets[-1][1] - offsets[0][0]
    counter = 0
    # A single thread may be assigned several shards.
    for offset, output_filename in zip(offsets, output_filenames):
        output_file = os.path.join(FLAGS.output_dir, output_filename)
        writer = tf.python_io.TFRecordWriter(output_file)

        # `offset` holds the (start, end) range of the current shard.
        files_in_shard = np.arange(offset[0], offset[1], dtype=int)
        shard_counter = 0
        for i in files_in_shard:
            filename = filenames[i]
            label = labels[i]
            # File id is the basename without extension; used as the bbox key.
            file_id = os.path.splitext(os.path.basename(filename))[0]
            bbox = None if bbox_info is None else bbox_info[file_id]

            # NOTE(review): _process_image also returns the actual image
            # format, but the literal 'jpg' is written below — presumably
            # _process_image always re-encodes to JPEG; confirm before
            # relying on the format field.
            image_data, height, width, image_format = _process_image(
                filename, bbox)

            example = data_util.convert_to_example_without_bbox(
                image_data, 'jpg', label, height, width)
            writer.write(example.SerializeToString())
            counter += 1
            shard_counter += 1

            if not counter % 1000:
                dataset_utils.log(
                    '%s [thread %2d]: Processed %d of %d images in thread batch.'
                    % (datetime.now(), thread_index, counter,
                       num_files_in_thread))

        writer.close()
        dataset_utils.log(
            '%s [thread %2d]: Wrote %d images to %s' %
            (datetime.now(), thread_index, shard_counter, output_file))
def main(unused_argv):
    """Entry point: convert the Naver-Food image set into TFRecord shards."""
    if FLAGS.data_dir is None or FLAGS.output_dir is None:
        parser.print_help()
        return

    dataset_utils.log('Make Naver-Food TFRecord dataset by label.')

    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    bbox_info = None
    if FLAGS.use_bbox:
        bbox_info = _get_bbox_info(FLAGS.data_dir)
        dataset_utils.log(' - Use bounding box info. (opt. ON)')

    source_dir = os.path.join(FLAGS.data_dir, 'images')

    filenames_train, labels_train, id_to_name, total = _find_image_files(
        'train', source_dir)
    dataset_utils.log('Convert [train] dataset.')
    _process_dataset('train', filenames_train, labels_train, bbox_info, 128)

    filenames_val, labels_val, id_to_name, total = _find_image_files(
        'validation', source_dir)
    dataset_utils.log('Convert [validation] dataset.')
    _process_dataset('validation', filenames_val, labels_val, bbox_info, 16)
def main(unused_argv):
    """Entry point: build FGVC-Aircraft TFRecord datasets.

    The official train and val splits are merged and shuffled (fixed seed
    for reproducibility) into the training set; the test split becomes the
    validation set.
    """
    if FLAGS.data_dir is None or FLAGS.output_dir is None:
        parser.print_help()
        return

    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    data_path = FLAGS.data_dir
    archive_path = os.path.join(data_path, "fgvc-aircraft-2013b.tar.gz")
    extracted_dir = os.path.join(data_path, 'fgvc-aircraft-2013b')
    if not os.path.exists(extracted_dir):
        # Extract with the stdlib instead of shelling out to `tar`; this is
        # portable (no GNU-tar-only --force-local flag, no backslash munging)
        # and does not require an external binary on PATH.
        import tarfile
        with tarfile.open(archive_path, 'r:gz') as archive:
            archive.extractall(data_path)
    data_path = extracted_dir

    train_filenames, train_labels = _get_filenames_and_labels(
        data_path, 'data/images_train.txt', 'data/images_variant_train.txt',
        'data/variants.txt')
    train_filenames2, train_labels2 = _get_filenames_and_labels(
        data_path, 'data/images_val.txt', 'data/images_variant_val.txt',
        'data/variants.txt')
    train_filenames.extend(train_filenames2)
    train_labels.extend(train_labels2)

    # Shuffle train+val together with a fixed seed so runs are reproducible.
    shuffled_index = list(range(len(train_filenames)))
    random.seed(12345)
    random.shuffle(shuffled_index)
    train_filenames = [train_filenames[i] for i in shuffled_index]
    train_labels = [train_labels[i] for i in shuffled_index]

    validation_filenames, validation_labels = _get_filenames_and_labels(
        data_path, 'data/images_test.txt', 'data/images_variant_test.txt',
        'data/variants.txt', shuffle=False)

    dataset_utils.log('Convert [train] dataset.')
    _process_dataset('train', train_filenames, train_labels,
                     FLAGS.train_shards)
    dataset_utils.log('Convert [validation] dataset.')
    _process_dataset('validation', validation_filenames, validation_labels,
                     FLAGS.validation_shards)
def main(unused_argv):
    """Entry point: convert train/validation image sets into TFRecords."""
    if FLAGS.data_dir is None or FLAGS.output_dir is None:
        parser.print_help()
        return

    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    bbox_info = None
    if FLAGS.use_bbox:
        bbox_info = _get_bbox_info()
        dataset_utils.log(' - Use bounding box info. (opt. ON)')

    for split in ('train', 'validation'):
        filenames, labels, total = _find_image_files(split, FLAGS.data_dir)
        _process_dataset(split, filenames, labels, bbox_info)
def main(_):
    """Entry point: build TFRecord datasets from meta/{labels,train,test}.txt.

    Requires train/validation shard counts to be divisible by the thread
    count so every thread writes whole shards.
    """
    if FLAGS.data_dir is None or FLAGS.output_dir is None:
        parser.print_help()
        return

    assert not FLAGS.train_shards % FLAGS.num_threads, (
        'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards')
    assert not FLAGS.validation_shards % FLAGS.num_threads, (
        'Please make the FLAGS.num_threads commensurate with FLAGS.validation_shards')
    print('Saving results to {}'.format(FLAGS.output_dir))

    dataset_utils.log('Make UEC-food100 TFRecord dataset.')

    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    # The original wrapped these literals in a no-op builtin format() call;
    # plain string literals are equivalent.
    label_path = os.path.join(FLAGS.data_dir, "meta/labels.txt")
    train_path = os.path.join(FLAGS.data_dir, "meta/train.txt")
    validation_path = os.path.join(FLAGS.data_dir, "meta/test.txt")

    train_filenames, train_labels = _get_filenames_and_labels(
        FLAGS.data_dir, train_path, label_path, shuffle=True)
    validation_filenames, validation_labels = _get_filenames_and_labels(
        FLAGS.data_dir, validation_path, label_path, shuffle=False)

    dataset_utils.log('Convert [train] dataset.')
    _process_dataset('train', train_filenames, train_labels,
                     FLAGS.train_shards)
    dataset_utils.log('Convert [validation] dataset.')
    _process_dataset('validation', validation_filenames, validation_labels,
                     FLAGS.validation_shards)
def main(_):
    """Entry point: build TFRecord datasets from an input image directory.

    Removed the commented-out alternative loader and hard-coded user home
    paths that were left over from a previous dataset.
    """
    if FLAGS.input_dir is None or FLAGS.output_dir is None:
        parser.print_help()
        return

    assert not FLAGS.train_shards % FLAGS.num_threads, (
        'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards')
    assert not FLAGS.validation_shards % FLAGS.num_threads, (
        'Please make the FLAGS.num_threads commensurate with FLAGS.validation_shards')
    print('Saving results to {}'.format(FLAGS.output_dir))

    # NOTE(review): the log message says UEC-food100 but the removed dead
    # code referenced Stanford Online Products — likely a copy-paste
    # leftover; confirm which dataset this script actually targets.
    dataset_utils.log('Make UEC-food100 TFRecord dataset.')

    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    train_filenames, train_labels = get_filenames_and_labels(FLAGS.input_dir)
    validation_filenames, validation_labels = get_filenames_and_labels(
        FLAGS.input_dir, is_training=False)

    dataset_utils.log('Convert [train] dataset.')
    _process_dataset('train', train_filenames, train_labels,
                     FLAGS.train_shards)
    dataset_utils.log('Convert [validation] dataset.')
    _process_dataset('validation', validation_filenames, validation_labels,
                     FLAGS.validation_shards)
def main(unused_argv):
    """Entry point: split one image directory into train/validation TFRecords.

    Removed a large commented-out processing block (older 3-argument
    _process_dataset calls plus label-name export) that no longer matched
    the current pipeline.
    """
    if FLAGS.data_dir is None or FLAGS.output_dir is None:
        parser.print_help()
        return

    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    bbox_info = None
    if FLAGS.use_bbox:
        bbox_info = _get_bbox_info()
        dataset_utils.log(' - Use bounding box info. (opt. ON)')
    # NOTE(review): bbox_info is computed but never passed to
    # _process_dataset below — confirm whether the bbox option is still
    # supported by this pipeline.

    filenames_train, filenames_val, labels_train, labels_val, total_train, total_val \
        = _find_image_files(FLAGS.data_dir)

    # Debug leftover, kept to preserve output; consider dataset_utils.log.
    print('filenames_train', len(filenames_train))

    dataset_utils.log('Convert [train] dataset.')
    _process_dataset('train', filenames_train, labels_train,
                     FLAGS.train_shards)
    dataset_utils.log('Convert [validation] dataset.')
    _process_dataset('validation', filenames_val, labels_val,
                     FLAGS.validation_shards)
def main(_):
    """Entry point: build UEC-food100 TFRecord datasets from val{} splits."""
    if FLAGS.data_dir is None or FLAGS.output_dir is None:
        parser.print_help()
        return

    assert not FLAGS.train_shards % FLAGS.num_threads, (
        'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards'
    )
    assert not FLAGS.validation_shards % FLAGS.num_threads, (
        'Please make the FLAGS.num_threads commensurate with FLAGS.validation_shards'
    )
    print('Saving results to {}'.format(FLAGS.output_dir))

    dataset_utils.log('Make UEC-food100 TFRecord dataset.')

    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    bbox_info = None
    if FLAGS.use_bbox:
        bbox_info = _get_bbox_info(FLAGS.data_dir)
        dataset_utils.log(' - Use bounding box info. (opt. ON)')

    # Splits val0.txt..val3.txt form the training set; val4.txt is held
    # out as the validation set.
    train_filenames, train_labels = _get_filenames_and_labels(
        [0, 1, 2, 3], shuffle=True)
    validation_filenames, validation_labels = _get_filenames_and_labels(
        [4], shuffle=False)

    dataset_utils.log('Convert [train] dataset.')
    _process_dataset('train', train_filenames, train_labels, bbox_info,
                     FLAGS.train_shards)
    dataset_utils.log('Convert [validation] dataset.')
    _process_dataset('validation', validation_filenames, validation_labels,
                     bbox_info, FLAGS.validation_shards)

    dataset_utils.log('Make UEC-food100 TFRecord dataset. [OK]')