def test_data_preprocess(self):
    img_resize = (16, 16)
    color_channels = 3  # RGB
    train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file = data_helper.get_jpeg_data_files_paths()
    assert os.path.exists(train_jpeg_dir), "The {} folder does not exist".format(train_jpeg_dir)
    assert os.path.exists(test_jpeg_dir), "The {} folder does not exist".format(test_jpeg_dir)
    assert os.path.exists(test_jpeg_additional), "The {} folder does not exist".format(test_jpeg_additional)
    assert os.path.exists(train_csv_file), "The {} file does not exist".format(train_csv_file)

    x_train, y_train, y_map = data_helper.preprocess_train_data(train_jpeg_dir, train_csv_file, img_resize=img_resize)
    x_test, _ = data_helper.preprocess_test_data(test_jpeg_dir, img_resize=img_resize)
    x_test_add, _ = data_helper.preprocess_test_data(test_jpeg_additional, img_resize=img_resize)

    # Count the unique labels and the files in each dataset to derive the expected shapes
    labels_df = pd.read_csv(train_csv_file)
    labels_count = len(set(chain.from_iterable([tags.split(" ") for tags in labels_df['tags'].values])))
    train_files_count = len(os.listdir(train_jpeg_dir))
    test_files_count = len(os.listdir(test_jpeg_dir))
    test_add_file_count = len(os.listdir(test_jpeg_additional))

    assert x_train.shape == (train_files_count, *img_resize, color_channels)
    assert x_test.shape == (test_files_count, *img_resize, color_channels)
    assert x_test_add.shape == (test_add_file_count, *img_resize, color_channels)
    assert y_train.shape == (train_files_count, labels_count)
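# The shape assertions above rely on starred tuple unpacking; a minimal
# standalone illustration of what they expand to (the values here are
# examples, not repository data):
img_resize = (16, 16)
color_channels = 3
assert (10, *img_resize, color_channels) == (10, 16, 16, 3)  # NHWC image tensor layout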
# # Data preprocessing
# Preprocess the data in order to fit it into the Keras model.
#
# Due to the huge amount of memory the resulting matrices will take, the preprocessing will be split into several steps (a code sketch of this flow follows below):
# - Preprocess training data (images and labels) and train the neural net with it
# - Delete the training data and call the gc to free up memory
# - Preprocess the first testing set
# - Predict the first testing set labels
# - Delete the first testing set
# - Preprocess the second testing set
# - Predict the second testing set labels and append them to the first testing set
# - Delete the second testing set

# In[9]:

x_train, y_train, y_map = data_helper.preprocess_train_data(train_jpeg_dir, train_csv_file, img_resize)
# Free up all available memory space after this heavy operation
gc.collect()

# In[10]:

print("x_train shape: {}".format(x_train.shape))
print("y_train shape: {}".format(y_train.shape))
print(y_map)

# ## Create the neural network definition

# In[11]:

# classifier = AmazonKerasClassifier()
# classifier.add_conv_layer(img_resize)
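# A minimal sketch of the staged predict/delete flow described above.
# Two assumptions for illustration only: that `classifier.predict` returns
# one row of label probabilities per image, and that the second return
# value of `preprocess_test_data` is the list of filenames; neither is a
# confirmed API of this repository.

import numpy as np

x_test, x_test_filename = data_helper.preprocess_test_data(test_jpeg_dir, img_resize)
predictions = classifier.predict(x_test)  # label probabilities for the first testing set
del x_test  # delete the first testing set...
gc.collect()  # ...and reclaim its memory before loading the second

x_test, x_test_filename_additional = data_helper.preprocess_test_data(test_jpeg_additional, img_resize)
new_predictions = classifier.predict(x_test)  # label probabilities for the second testing set
del x_test
gc.collect()

# Append the second testing set predictions (and filenames) to the first
predictions = np.vstack((predictions, new_predictions))
x_test_filename = np.hstack((x_test_filename, x_test_filename_additional))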
else: print("All datasets are present.") train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file = data_helper.get_jpeg_data_files_paths() labels_df = pd.read_csv(train_csv_file) from itertools import chain labels_list = list(chain.from_iterable([tags.split(" ") for tags in labels_df['tags'].values])) labels_set = set(labels_list) print("There is {} unique labels including {}".format(len(labels_set), labels_set)) img_resize = (64, 64) # The resize size of each image validation_split_size = 0.2 batch_size = 128 x_train, y_train, y_map = data_helper.preprocess_train_data(train_jpeg_dir, train_csv_file, img_resize) # Free up all available memory space after this heavy operation gc.collect(); print("x_train shape: {}".format(x_train.shape)) print("y_train shape: {}".format(y_train.shape)) from tensorflow.contrib.keras.api.keras.callbacks import ModelCheckpoint filepath="weights.best.hdf5" checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True) classifier = AmazonKerasClassifier() classifier.add_conv_layer(img_resize) classifier.add_flatten_layer() classifier.add_ann_layer(len(y_map))