def write_features(dataset_dir, save_dir):
    """Convert the train/test/validation splits of a text dataset into TFRecords.

    :param dataset_dir: root directory of the dataset (must contain 'sample.txt')
    :param save_dir: directory the .tfrecords files are written into (created if absent)
    :return: None
    """
    if not ops.exists(save_dir):
        os.makedirs(save_dir)

    print('Initialize the dataset provider ......')
    provider = data_provider.TextDataProvider(dataset_dir=dataset_dir,
                                              annotation_name='sample.txt',
                                              validation_set=True, validation_split=0.15,
                                              shuffle=None, normalization=None)
    # FIX: corrected "intialize" typo in the status message.
    print('Dataset provider initialize complete')

    feature_io = data_utils.TextFeatureIO()

    def _write_split(dataset, tfrecords_name):
        # Flatten each image (assumed 100x32x3 — TODO confirm against provider) to raw bytes.
        images = [bytes(list(np.reshape(tmp, [100 * 32 * 3]))) for tmp in dataset.images]
        tfrecord_path = ops.join(save_dir, tfrecords_name)
        feature_io.writer.write_features(tfrecords_path=tfrecord_path,
                                         labels=dataset.labels,
                                         images=images,
                                         imagenames=dataset.imagenames)

    # The three splits previously duplicated identical serialization code verbatim;
    # each now goes through the single helper above.
    print('Start writing training tf records')
    _write_split(provider.train, 'train_feature.tfrecords')

    print('Start writing testing tf records')
    _write_split(provider.test, 'test_feature.tfrecords')

    print('Start writing validation tf records')
    _write_split(provider.validation, 'validation_feature.tfrecords')
    return
def write_features(dataset_dir, save_dir, batch_size):
    """Convert a text dataset into sharded TFRecords, one shard per batch.

    :param dataset_dir: root directory of the dataset (must contain 'sample.txt')
    :param save_dir: directory the .tfrecords shards are written into (created if absent)
    :param batch_size: number of examples serialized into each shard file
    :return: None
    """
    if not ops.exists(save_dir):
        os.makedirs(save_dir)

    print('Initialize the dataset provider ......')
    provider = data_provider.TextDataProvider(dataset_dir=dataset_dir,
                                              annotation_name='sample.txt',
                                              validation_set=True, validation_split=0.05,
                                              shuffle='every_epoch', normalization=None)
    # FIX: corrected "intialize" typo in the status message.
    print('Dataset provider initialize complete')

    feature_io = data_utils.TextFeatureIO()

    def _write_split(dataset, split_name):
        """Write one split in shards of `batch_size`, naming each shard by its example range."""
        total = dataset.num_examples
        # Renamed from the misleading `epoch_nums`: this is the number of batches, not epochs.
        batch_nums = int(math.ceil(total / batch_size))
        for loop in tqdm.tqdm(range(batch_nums)):
            images, labels, imagenames = dataset.next_batch(batch_size=batch_size)
            # cv2.resize takes dsize as (width, height), so every image becomes width x 32.
            # BUG FIX: the test/validation branches previously passed (32, width), which
            # produced 32-wide, `width`-tall images inconsistent with the train split.
            # NOTE(review): `width` must be defined at module level — confirm.
            images = [cv2.resize(tmp, (width, 32)) for tmp in images]
            images = [bytes(list(np.reshape(tmp, [width * 32 * 3]))) for tmp in images]
            # The final shard is truncated at the dataset size.
            end = min(loop * batch_size + batch_size, total)
            tfrecord_path = ops.join(save_dir, '{:s}_feature_{:d}_{:d}.tfrecords'.format(
                split_name, loop * batch_size, end))
            feature_io.writer.write_features(tfrecords_path=tfrecord_path, labels=labels,
                                             images=images, imagenames=imagenames)

    # The three splits previously duplicated the shard loop verbatim; each now goes
    # through the single helper above.
    print('Start writing training tf records')
    _write_split(provider.train, 'train')

    print('Start writing testing tf records')
    _write_split(provider.test, 'test')

    print('Start writing validation tf records')
    _write_split(provider.validation, 'validation')
    return
# NOTE(review): this fragment completes a `feature_io.writer.write_features(...)` call
# whose opening lines live in a preceding chunk of the file — do not remove in isolation.
                                     images=images, imagenames=imagenames)


if __name__ == '__main__':
    # Parse CLI arguments and load the model/architecture config file.
    args = init_args()
    config = load_config(args.config_file)
    if not ops.exists(args.dataset_dir):
        raise ValueError('Dataset {:s} doesn\'t exist'.format(args.dataset_dir))
    os.makedirs(args.save_dir, exist_ok=True)
    print('Initializing the dataset provider...')
    # validation_set is enabled only when a positive split fraction was requested;
    # input size comes from the architecture section of the loaded config.
    provider = data_provider.TextDataProvider(
        dataset_dir=args.dataset_dir, annotation_name=args.annotation_file,
        validation_set=args.validation_split > 0,
        validation_split=args.validation_split,
        shuffle='every_epoch', normalization=args.normalization,
        input_size=config.cfg.ARCH.INPUT_SIZE)
    # One tfrecords set per split; character-set maps are taken from args.charset_dir.
    write_tfrecords(provider.train, "train", args.save_dir, args.charset_dir)
    write_tfrecords(provider.test, "test", args.save_dir, args.charset_dir)
    write_tfrecords(provider.validation, "val", args.save_dir, args.charset_dir)
def write_features(dataset_dir: str, save_dir: str, annotation_name: str,
                   validation_split: float, normalization: str, char_maps: str):
    """Process training, test and validation data into TensorFlow records.

    :param dataset_dir: root containing the Train and Test datasets
    :param save_dir: where to store the tf records
    :param annotation_name: name of the annotations file in each dataset dir
    :param validation_split: fraction of training data to use for validation
    :param normalization: image normalization mode, e.g. 'divide_255', 'divide_256'
    :param char_maps: directory in which to build/update character maps from the
        split labels; pass None to use the default maps under 'data/char_dict'.
        (FIX: docstring previously documented a nonexistent `build_char_maps` param.)
    """
    os.makedirs(save_dir, exist_ok=True)

    print('Initializing the dataset provider... ', end='', flush=True)
    provider = data_provider.TextDataProvider(
        dataset_dir=dataset_dir, annotation_name=annotation_name,
        validation_set=validation_split > 0, validation_split=validation_split,
        shuffle='every_epoch', normalization=normalization)
    print('done.')

    # Both map files always live side by side in one directory.
    char_maps_dir = char_maps if char_maps is not None else "data/char_dict"
    char_dict_path = os.path.join(char_maps_dir, "char_dict.json")
    ord_map_dict_path = os.path.join(char_maps_dir, "ord_map.json")

    # Running union of every character seen across the splits processed so far.
    all_chars = set()

    def _update_char_maps(labels, note):
        """Fold the characters of `labels` into the running set and rewrite both maps."""
        nonlocal all_chars
        all_chars = all_chars.union(reduce(lambda a, b: set(a).union(set(b)), labels))
        # BUG FIX: previously created only dirname(char_maps) (the parent directory),
        # so writing the json files failed when char_maps itself did not exist.
        os.makedirs(char_maps_dir, exist_ok=True)
        CharDictBuilder.write_char_dict(all_chars, char_dict_path)
        CharDictBuilder.map_ord_to_index(all_chars, ord_map_dict_path)
        print(note)

    def _write_split(dataset, tfrecords_name, note):
        """Serialize one split (images flattened to raw 100*32*3 bytes) into a tfrecords file."""
        images = [bytes(list(np.reshape(tmp, [100 * 32 * 3]))) for tmp in dataset.images]
        labels = dataset.labels
        tfrecord_path = ops.join(save_dir, tfrecords_name)
        if char_maps is not None:
            _update_char_maps(labels, note)
        # Re-create the feature IO so it picks up the freshly (re)written maps.
        feature_io = TextFeatureIO(char_dict_path, ord_map_dict_path)
        feature_io.writer.write_features(tfrecords_path=tfrecord_path, labels=labels,
                                         images=images, imagenames=dataset.imagenames)

    # The three splits previously duplicated the map-update + serialization code
    # verbatim; each now goes through the helpers above.
    print('Writing tf records for training...')
    _write_split(provider.train, 'train_feature.tfrecords',
                 " (character maps written)")

    print('Writing tf records for testing...')
    _write_split(provider.test, 'test_feature.tfrecords',
                 " (updated character maps written)")

    print('Writing tf records for validation...')
    _write_split(provider.validation, 'validation_feature.tfrecords',
                 " (updated character maps written)")