def test_data_preprocess(self):
        img_resize = (16, 16)
        color_channels = 3  # RGB
        train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file = data_helper.get_jpeg_data_files_paths()

        assert os.path.exists(train_jpeg_dir), "The {} folder does not exist".format(train_jpeg_dir)
        assert os.path.exists(test_jpeg_dir), "The {} folder does not exist".format(test_jpeg_dir)
        assert os.path.exists(test_jpeg_additional), "The {} folder does not exist".format(test_jpeg_additional)
        assert os.path.exists(train_csv_file), "The {} file does not exist".format(train_csv_file)

        x_train, y_train, y_map = data_helper.preprocess_train_data(train_jpeg_dir, train_csv_file,
                                                                    img_resize=img_resize)

        x_test, _ = data_helper.preprocess_test_data(test_jpeg_dir, img_resize=img_resize)
        x_test_add, _ = data_helper.preprocess_test_data(test_jpeg_additional, img_resize=img_resize)

        labels_df = pd.read_csv(train_csv_file)
        labels_count = len(set(chain.from_iterable([tags.split(" ") for tags in labels_df['tags'].values])))
        train_files_count = len(os.listdir(train_jpeg_dir))
        test_files_count = len(os.listdir(test_jpeg_dir))
        test_add_file_count = len(os.listdir(test_jpeg_additional))
        assert x_train.shape == (train_files_count, *img_resize, color_channels)
        assert x_test.shape == (test_files_count, *img_resize, color_channels)
        assert x_test_add.shape == (test_add_file_count, *img_resize, color_channels)
        assert y_train.shape == (train_files_count, labels_count)
# # Data preprocessing
# Preprocess the data in order to fit it into the Keras model.
#
# Due to the huge amount of memory the resulting matrices will take, the preprocessing is split into several steps (see the sketch after this list):
#     - Preprocess training data (images and labels) and train the neural net with it
#     - Delete the training data and call the gc to free up memory
#     - Preprocess the first testing set
#     - Predict the first testing set labels
#     - Delete the first testing set
#     - Preprocess the second testing set
#     - Predict the second testing set labels and append them to the first testing set
#     - Delete the second testing set
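# As a reference, here is the flow above condensed into one helper. This is a
# minimal sketch of the memory-friendly ordering, not the notebook's actual
# implementation; it assumes `classifier` exposes a Keras-style `predict`
# method returning one row of scores per image:


def predict_in_stages(classifier, test_dirs, img_resize):
    import gc
    import numpy as np
    predictions = []
    for test_dir in test_dirs:
        # Preprocess one testing set at a time so only one is held in memory
        x_test, _ = data_helper.preprocess_test_data(test_dir, img_resize=img_resize)
        predictions.append(classifier.predict(x_test))
        # Delete the testing set and run the gc before loading the next one
        del x_test
        gc.collect()
    # Stack the per-set predictions back into a single matrix
    return np.vstack(predictions)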

# In[9]:

# First make sure every dataset is present on disk
train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file = data_helper.get_jpeg_data_files_paths()
missing = [path for path in (train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file)
           if not os.path.exists(path)]
if missing:
    print("The following datasets are missing: {}".format(", ".join(missing)))
else:
    print("All datasets are present.")

labels_df = pd.read_csv(train_csv_file)

from itertools import chain
labels_list = list(chain.from_iterable([tags.split(" ") for tags in labels_df['tags'].values]))
labels_set = set(labels_list)
print("There is {} unique labels including {}".format(len(labels_set), labels_set))

img_resize = (64, 64)  # Size to which each image is resized
validation_split_size = 0.2
batch_size = 128
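
# validation_split_size keeps 20% of the training data aside for validation.
# With scikit-learn the equivalent split would look like the commented-out
# sketch below (the classifier presumably performs a similar split internally,
# so this is illustration only):
#
# from sklearn.model_selection import train_test_split
# x_tr, x_val, y_tr, y_val = train_test_split(
#     x_train, y_train, test_size=validation_split_size)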

x_train, y_train, y_map = data_helper.preprocess_train_data(train_jpeg_dir, train_csv_file, img_resize)
# Ask the garbage collector to reclaim the memory used by this heavy operation
gc.collect()

print("x_train shape: {}".format(x_train.shape))
print("y_train shape: {}".format(y_train.shape))

# ## Create the neural network definition

# In[11]:

from tensorflow.contrib.keras.api.keras.callbacks import ModelCheckpoint

filepath = "weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)

classifier = AmazonKerasClassifier()
classifier.add_conv_layer(img_resize)
classifier.add_flatten_layer()
classifier.add_ann_layer(len(y_map))
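
# The checkpoint above only takes effect once it is passed to the training
# call. With plain Keras that is the `callbacks` argument of `fit`; the sketch
# below is commented out because it assumes direct access to the wrapped Keras
# model, and AmazonKerasClassifier's own training method may differ:
#
# model.fit(x_train, y_train,
#           batch_size=batch_size,
#           epochs=5,
#           validation_split=validation_split_size,
#           callbacks=[checkpoint])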