def test_should_save_and_read_pairs_correctly(batch_size):
    images_dataset: DictsDataset
    paths_dataset: DictsDataset
    images_dataset, paths_dataset = gen.dicts_dataset(batch_size=batch_size,
                                                      paired=True,
                                                      save_on_disc=True)
    raw_dataset_fragment = testing_helpers.dicts_dataset_to_raw_dataset_fragment(
        images_dataset)
    dataset_desc = gen.dataset_desc(
        storage_method=DatasetStorageMethod.ON_DISC,
        image_dimensions=ImageDimensions(testing_consts.TEST_IMAGE_SIZE))
    dataset_spec = gen.dataset_spec(description=dataset_desc,
                                    raw_dataset_fragment=raw_dataset_fragment)
    tfrecord_full_path = preparing_data.save_to_tfrecord(
        paths_dataset.features, paths_dataset.labels, 'data', dataset_spec)

    assert utils.check_filepath(tfrecord_full_path,
                                is_directory=False,
                                is_empty=False)

    dataset = reading_tfrecords.assemble_dataset(tfrecord_full_path.parent,
                                                 dataset_spec)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)
    first_batch = dataset.make_one_shot_iterator().get_next()

    _check_paired_result(
        first_batch,
        (images_dataset.features.left, images_dataset.features.right),
        images_dataset.labels)

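
# `_check_paired_result` is defined elsewhere in this test module. The
# hypothetical sketch below (note the `_sketch` suffix) shows what such a
# helper is assumed to verify: evaluate the batch tensors in a session and
# compare them with the in-memory images and labels. The (features, labels)
# tuple-of-dicts batch structure is an assumption.
def _check_paired_result_sketch(first_batch, expected_images, expected_labels):
    import tensorflow as tf  # assumed TF 1.x, matching make_one_shot_iterator
    with tf.Session() as sess:
        features, labels = sess.run(first_batch)
    expected_left, expected_right = expected_images
    np.testing.assert_allclose(features[consts.LEFT_FEATURE_IMAGE],
                               expected_left)
    np.testing.assert_allclose(features[consts.RIGHT_FEATURE_IMAGE],
                               expected_right)
    for name, expected in expected_labels.items():
        np.testing.assert_array_equal(labels[name], expected)
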
def test_should_create_same_pairs_without_repeating_pairs(
        number_translation_features_dict):
    pairs, _ = create_same_pairs(number_translation_features_dict,
                                 MIN_PAIRS_NUM,
                                 gen.dataset_spec(repeating_pairs=False))

    seen = set()
    for left, right in pairs:
        assert (left.trans, right.trans) not in seen
        seen.add((left.trans, right.trans))

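
# A minimal sketch (not the project's `create_same_pairs` implementation) of
# the non-repeating strategy this test exercises: enumerate all same-class
# candidate pairs and draw without replacement, so no pair can repeat.
# `features_dict` mapping a class label to its feature list is an assumption.
def _sample_same_pairs_sketch(features_dict, min_pairs_num):
    import itertools
    import random
    candidates = [pair
                  for features in features_dict.values()
                  for pair in itertools.combinations(features, 2)]
    # raises ValueError if fewer than min_pairs_num distinct pairs exist
    return random.sample(candidates, min_pairs_num)
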
def test_processed_input_data_dirs_placement(encoding, paired):
    processed_input_data_dir = filenames.get_processed_input_data_dir(
        gen.dataset_spec(encoding=encoding, paired=paired))
    expected = ('/tf/datasets/'
                + ((consts.INPUT_DATA_NOT_ENCODED_DIR_FRAGMENT + '/')
                   if not encoding else '')
                + (consts.INPUT_DATA_NOT_PAIRED_DIR_FRAGMENT
                   if not paired else consts.INPUT_DATA_PAIRED_DIR_FRAGMENT))

    assert_that(str(processed_input_data_dir), ends_with(expected))

def test_should_save_and_read_unpaired_correctly(batch_size):
    images_dataset: DictsDataset = gen.dicts_dataset(batch_size=batch_size,
                                                     paired=False)
    tfrecord_full_path = preparing_data.save_to_tfrecord(
        images_dataset.features, images_dataset.labels, 'data',
        gen.dataset_spec(paired=False))

    assert utils.check_filepath(tfrecord_full_path,
                                is_directory=False,
                                is_empty=False)

    dataset = reading_tfrecords.assemble_dataset(
        tfrecord_full_path.parent, gen.dataset_spec(paired=False))
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    first_batch = iterator.get_next()

    _check_result(first_batch, images_dataset.features.all,
                  images_dataset.labels)

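
# Similarly, `_check_result` is defined elsewhere in this module; a
# hypothetical unpaired variant, under the same (features, labels)
# batch-structure assumption as the paired sketch above.
def _check_result_sketch(first_batch, expected_images, expected_labels):
    import tensorflow as tf  # assumed TF 1.x, matching make_one_shot_iterator
    with tf.Session() as sess:
        images, labels = sess.run(first_batch)
    np.testing.assert_allclose(images, expected_images)
    for name, expected in expected_labels.items():
        np.testing.assert_array_equal(labels[name], expected)
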
def test_should_save_image_correctly(thor_image_path, encoding):
    show = False
    if thor_image_path.endswith(".jpg"):
        from PIL import Image
        # PIL images do not support numpy-style indexing; convert to an array
        thor = np.asarray(Image.open(tf_helpers.get_string(thor_image_path)))
    else:
        thor = mpimg.imread(tf_helpers.get_string(thor_image_path))
    image_arr = thor[None, :]
    if show:
        plt.imshow(image_arr.squeeze())
        plt.title('before')
        plt.show()
    two_images = {
        consts.LEFT_FEATURE_IMAGE: image_arr,
        consts.RIGHT_FEATURE_IMAGE: image_arr
    }
    label_dict = gen.paired_labels_dict()
    tfrecord_full_path = preparing_data.save_to_tfrecord(
        two_images, label_dict, 'thor', gen.dataset_spec(encoding=encoding))

    dataset = reading_tfrecords.assemble_dataset(
        tfrecord_full_path.parent, gen.dataset_spec(encoding=encoding))
    left_images, _, _, _, _ = tf_helpers.unpack_first_batch(dataset)
    # shift back from the reading pipeline's assumed [-0.5, 0.5] value range
    decoded_thor = left_images + 0.5
    if show:
        plt.imshow(decoded_thor)
        plt.title('after')
        plt.show()

    assert np.squeeze(decoded_thor).shape == np.squeeze(image_arr).shape
    assert np.allclose(decoded_thor, image_arr)

def test_should_read_and_save_image_correctly(thor_image_path, resizing):
    show = False
    thor = mpimg.imread(tf_helpers.get_string(thor_image_path))
    thor = skimage.img_as_float(thor)
    image_arr = thor[None, :]
    if show:
        plt.imshow(image_arr.squeeze())
        plt.title('before')
        plt.show()
    features_as_paths = {
        consts.FEATURES: np.array([thor_image_path]),
    }
    labels = gen.unpaired_labels_dict()
    if resizing:
        shape = (100, 100, 3)
    else:
        shape = thor.shape
    dataset_desc = gen.dataset_desc(
        storage_method=DatasetStorageMethod.ON_DISC,
        image_dimensions=ImageDimensions(shape))
    raw_dataset_fragment = RawDatasetFragment(
        features=image_arr, labels=np.array(list(labels.values())))
    dataset_spec = gen.dataset_spec(description=dataset_desc,
                                    raw_dataset_fragment=raw_dataset_fragment,
                                    paired=False)
    tfrecord_full_path = preparing_data.save_to_tfrecord(
        features_as_paths, labels, 'thor', dataset_spec)

    dataset = reading_tfrecords.assemble_dataset(tfrecord_full_path.parent,
                                                 dataset_spec)
    left_images, _ = tf_helpers.unpack_first_batch(dataset)
    decoded_thor = left_images + 0.5
    if show:
        plt.imshow(decoded_thor)
        plt.title('after')
        plt.show()

    assert np.squeeze(decoded_thor).shape == shape
    if not resizing:
        assert np.allclose(decoded_thor, image_arr, rtol=1.e-1, atol=1.e-1)

def test_all_unpaired_dataset_providers_should_get_features_from_raw_data_provider(
        description, dataset_provider_cls_name):
    provider = dataset_provider_cls_name(
        FakeRawDataProvider(curated=True, description=description))
    image_dims = provider.raw_data_provider.description.image_dimensions
    batch_size = 12
    dataset_spec = gen.dataset_spec(description=description,
                                    type=DatasetType.TEST,
                                    with_excludes=False,
                                    encoding=False,
                                    paired=False)
    dataset = provider.supply_dataset(dataset_spec,
                                      batch_size=batch_size).take(100)
    images, labels = tf_helpers.unpack_first_batch(dataset)

    assert images.shape == (batch_size, *image_dims)
    assert labels.shape == (batch_size,)

def test_should_include_reduced_size_in_path(expected_size,
                                             should_image_size_be_reduced):
    images_dataset: DictsDataset
    paths_dataset: DictsDataset
    images_dataset, paths_dataset = gen.dicts_dataset(save_on_disc=True)
    dataset_desc = gen.dataset_desc(
        storage_method=DatasetStorageMethod.ON_DISC,
        image_dimensions=ImageDimensions(expected_size))
    raw_dataset_fragment = testing_helpers.dicts_dataset_to_raw_dataset_fragment(
        images_dataset)
    dataset_spec = gen.dataset_spec(description=dataset_desc,
                                    raw_dataset_fragment=raw_dataset_fragment,
                                    paired=False)
    tfrecord_full_path = preparing_data.save_to_tfrecord(
        paths_dataset.features, paths_dataset.labels, 'data', dataset_spec)

    parts = tfrecord_full_path.parts
    if should_image_size_be_reduced:
        assert ("size_" + str(expected_size[0])) in parts
    else:
        # hamcrest's `contains` matches the whole sequence exactly, so its
        # negation would always pass here; `has_item` is the membership matcher
        assert_that(parts, not_(has_item("size_" + str(expected_size[0]))))

import collections

import pytest
from hamcrest import assert_that, is_in

from src.data.processing.generating_pairs import (create_same_pairs,
                                                  create_different_pairs,
                                                  determine_class_size,
                                                  get_random_element)
from testing_utils import gen

DATASET_SPEC = gen.dataset_spec()
MIN_PAIRS_NUM = 30


@pytest.mark.parametrize('min_pairs_num, actual_pair_num',
                         [(15, 15), (20, 21), (1000, 1002)])
def test_should_create_correct_pair_number(number_translation_features_dict,
                                           min_pairs_num, actual_pair_num):
    pairs, same_labels = create_same_pairs(number_translation_features_dict,
                                           min_pairs_num, DATASET_SPEC)

    assert len(pairs) == len(same_labels) == actual_pair_num


def test_should_create_same_pairs(number_translation_features_dict):
    pairs, _ = create_same_pairs(number_translation_features_dict,
                                 MIN_PAIRS_NUM, DATASET_SPEC)

    assert len(pairs) == MIN_PAIRS_NUM
    for left, right in pairs:
        assert left.number == right.number
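

# `number_translation_features_dict` is assumed to be a conftest.py fixture; a
# hypothetical sketch of its shape, consistent with the `.number`/`.trans`
# attributes the tests read: each class label maps to several translations of
# the same number.
NumberTranslationSketch = collections.namedtuple('NumberTranslationSketch',
                                                 ['number', 'trans'])


def _number_translation_features_dict_sketch():
    translations = {1: ['one', 'uno'], 2: ['two', 'dos'], 3: ['three', 'tres']}
    return {
        number: [NumberTranslationSketch(number, word) for word in words]
        for number, words in translations.items()
    }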