def test_data_preprocess(self):
    """Check the shapes of the arrays produced by the preprocessing helpers.

    The expected shapes are derived from the dataset on disk: the number of
    entries in each image folder and the number of distinct tags found in the
    ``tags`` column of the training CSV file.
    """
    img_resize = (16, 16)
    color_channels = 3  # RGB

    (train_jpeg_dir, test_jpeg_dir, test_jpeg_additional,
     train_csv_file) = data_helper.get_jpeg_data_files_paths()

    # Fail early with a readable message when part of the dataset is missing.
    assert os.path.exists(train_jpeg_dir), "The {} folder does not exist".format(train_jpeg_dir)
    assert os.path.exists(test_jpeg_dir), "The {} folder does not exist".format(test_jpeg_dir)
    # test_jpeg_additional is listed with os.listdir() below, so it is a
    # folder and the message must say so.
    assert os.path.exists(test_jpeg_additional), "The {} folder does not exist".format(test_jpeg_additional)
    assert os.path.exists(train_csv_file), "The {} file does not exist".format(train_csv_file)

    # The third value returned for the training set is not needed here.
    x_train, y_train, _ = data_helper.preprocess_train_data(
        train_jpeg_dir, train_csv_file, img_resize=img_resize)
    x_test, _ = data_helper.preprocess_test_data(test_jpeg_dir, img_resize=img_resize)
    x_test_add, _ = data_helper.preprocess_test_data(test_jpeg_additional, img_resize=img_resize)

    # Each row of the 'tags' column holds space-separated tags; count the
    # distinct ones across the whole file.
    labels_df = pd.read_csv(train_csv_file)
    labels_count = len(set(chain.from_iterable(
        tags.split(" ") for tags in labels_df['tags'].values)))

    train_files_count = len(os.listdir(train_jpeg_dir))
    test_files_count = len(os.listdir(test_jpeg_dir))
    test_add_file_count = len(os.listdir(test_jpeg_additional))

    assert x_train.shape == (train_files_count, *img_resize, color_channels)
    assert x_test.shape == (test_files_count, *img_resize, color_channels)
    assert x_test_add.shape == (test_add_file_count, *img_resize, color_channels)
    assert y_train.shape == (train_files_count, labels_count)
def test_data_preprocess(self):
    """Verify the shapes of the arrays returned by ``data_helper.preprocess_data``.

    The test images come from two folders, so the expected number of test
    samples is the sum of the entry counts of both. The expected number of
    label columns is the number of distinct tags in the training CSV file.
    """
    img_resize = (16, 16)
    color_channels = 3  # RGB

    paths = data_helper.get_jpeg_data_files_paths()
    train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file = paths

    # Only the three arrays are checked; the last two return values are unused.
    x_train, x_test, y_train, _label_map, _test_filenames = data_helper.preprocess_data(
        train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file,
        img_resize)

    # Collect every distinct tag from the space-separated 'tags' column.
    unique_tags = set()
    for tags in pd.read_csv(train_csv_file)['tags'].values:
        unique_tags.update(tags.split(" "))
    labels_count = len(unique_tags)

    n_train = len(os.listdir(train_jpeg_dir))
    n_test = len(os.listdir(test_jpeg_dir)) + len(os.listdir(test_jpeg_additional))

    image_shape = (*img_resize, color_channels)
    assert x_train.shape == (n_train,) + image_shape
    assert x_test.shape == (n_test,) + image_shape
    assert y_train.shape == (n_train, labels_count)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import data_helper
# The keras_helper classifier import is disabled; XceptionClassifier is imported instead.
#from keras_helper import AmazonKerasClassifier
from src.xception_classifier import XceptionClassifier

# Training configuration.
#img_resize = (64, 64) # The resize size of each image
img_resize = (74, 74)  # new size required for the Xception model
validation_split_size = 0.2
epochs = 20
batch_size = 128

(train_jpeg_dir, test_jpeg_dir, test_jpeg_additional,
 train_csv_file) = data_helper.get_jpeg_data_files_paths()

labels_df = pd.read_csv(train_csv_file)
labels_df.head()

# Each image can be tagged with multiple tags, so list all the unique tags.

# In[5]:

# Print all unique tags
from itertools import chain

# Every row of the 'tags' column is a space-separated string of tags.
labels_list = list(
    chain.from_iterable(tags.split(" ") for tags in labels_df['tags'].values)
)
labels_set = set(labels_list)
print(
    "There is {} unique labels including {}".format(len(labels_set), labels_set)
)
item for sublist in list(df['tags'].apply(lambda row: row.split(" ")).values) for item in sublist ]
# NOTE(review): the line above is the tail of a list comprehension whose
# opening lies outside this view. It splits every 'tags' string of df on
# spaces and flattens the pieces into one list of individual tags --
# presumably the `all_tags` list used below; confirm against the full file.

# Number of occurrences of each tag: group the flat tag list by tag and keep
# the size of each group (column 1 of the reset_index() rows). The counts are
# stored as the single element of data_dist.
data_dist = []
data_dist.append([ x[1] for x in pd.DataFrame({ 'tag': all_tags }).groupby('tag').size().reset_index().values.tolist() ])
# Inverse-frequency weights, normalised so that they sum to 1.
# NOTE(review): the weights follow the group order produced by groupby --
# verify that this matches the label order used wherever they are consumed.
data_weight = [1.0 / x for x in data_dist[0]]
den = sum(data_weight)
data_weight = [x / den for x in data_weight]

# Locate the dataset and build the preprocessor with 224x224 images, the same
# spatial size as the VGG16 input_shape below.
[train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file] = get_jpeg_data_files_paths()
dataset = AmazonPreprocessor(train_jpeg_dir, train_csv_file, test_jpeg_dir, test_jpeg_additional, img_resize=(224, 224))
# NOTE(review): init() is defined elsewhere; presumably it prepares the data -- confirm.
dataset.init()

# VGG16 base without its top layers, created with weights='imagenet' and
# pooling='max', followed by a 256-unit ReLU Dense layer applied to its output.
initial_model = VGG16(include_top=False, weights='imagenet', input_shape=(224, 224, 3), pooling='max')
model_out = initial_model.output
# NOTE(review): input_shape is passed although the layer is called directly on
# a tensor; it looks redundant here -- confirm before removing.
model_out = Dense(256, activation='relu', input_shape=initial_model.output_shape[1:])(model_out)
import data_helper
from keras_helper import AmazonKerasClassifier
from kaggle_data.downloader import KaggleDataDownloader

# NOTE(review): `os` is used below but is not imported in the visible part of
# this script; it is presumably imported earlier in the file -- confirm.

competition_name = "planet-understanding-the-amazon-from-space"

# Archive names, each paired with the name of the tar file it decompresses to.
train, train_u = "train-jpg.tar.7z", "train-jpg.tar"
test, test_u = "test-jpg.tar.7z", "test-jpg.tar"
test_additional, test_additional_u = "test-jpg-additional.tar.7z", "test-jpg-additional.tar"
test_labels = "train_v2.csv.zip"
destination_path = "../input/"

# If any of the dataset paths already exists then the files may already be
# extracted. This is a bit hacky but it's sufficient for our needs.
datasets_path = data_helper.get_jpeg_data_files_paths()
existing_paths = [path for path in datasets_path if os.path.exists(path)]
is_datasets_present = bool(existing_paths)

if not is_datasets_present:
    # Put your Kaggle user name and password in a $KAGGLE_USER and $KAGGLE_PASSWD env vars respectively
    kaggle_user = os.getenv("KAGGLE_USER")
    kaggle_password = os.getenv("KAGGLE_PASSWD")
    downloader = KaggleDataDownloader(kaggle_user, kaggle_password, competition_name)

    train_output_path = downloader.download_dataset(train, destination_path)
    train_tar_path = destination_path + train_u
    # First pass turns the 7z archive into a tar file, second pass extracts
    # the content of that tar file.
    downloader.decompress(train_output_path, destination_path)
    downloader.decompress(train_tar_path, destination_path)
    # Clean up the intermediate archives: the 7z file, then the tar file.
    os.remove(train_output_path)
    os.remove(train_tar_path)

    test_output_path = downloader.download_dataset(test, destination_path)