# Module-level imports the test below relies on
import os
from itertools import chain

import pandas as pd

import data_helper


def test_data_preprocess(self):
        img_resize = (16, 16)
        color_channels = 3  # RGB
        train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file = data_helper.get_jpeg_data_files_paths()

        assert os.path.exists(train_jpeg_dir), "The {} folder does not exist".format(train_jpeg_dir)
        assert os.path.exists(test_jpeg_dir), "The {} folder does not exist".format(test_jpeg_dir)
        assert os.path.exists(test_jpeg_additional), "The {} folder does not exist".format(test_jpeg_additional)
        assert os.path.exists(train_csv_file), "The {} file does not exist".format(train_csv_file)

        x_train, y_train, y_map = data_helper.preprocess_train_data(train_jpeg_dir, train_csv_file,
                                                                    img_resize=img_resize)

        x_test, _ = data_helper.preprocess_test_data(test_jpeg_dir, img_resize=img_resize)
        x_test_add, _ = data_helper.preprocess_test_data(test_jpeg_additional, img_resize=img_resize)

        labels_df = pd.read_csv(train_csv_file)
        labels_count = len(set(chain.from_iterable([tags.split(" ") for tags in labels_df['tags'].values])))
        train_files_count = len(os.listdir(train_jpeg_dir))
        test_files_count = len(os.listdir(test_jpeg_dir))
        test_add_file_count = len(os.listdir(test_jpeg_additional))
        assert x_train.shape == (train_files_count, *img_resize, color_channels)
        assert x_test.shape == (test_files_count, *img_resize, color_channels)
        assert x_test_add.shape == (test_add_file_count, *img_resize, color_channels)
        assert y_train.shape == (train_files_count, labels_count)
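
# For context, a minimal sketch of the behavior the shape assertions above rely
# on. This is a hypothetical illustration, NOT the actual data_helper
# implementation: read every JPEG in a folder, resize it, and stack the results.
import numpy as np
from PIL import Image

def preprocess_test_data_sketch(jpeg_dir, img_resize=(16, 16)):
    filenames = sorted(os.listdir(jpeg_dir))
    images = []
    for name in filenames:
        img = Image.open(os.path.join(jpeg_dir, name)).convert("RGB")
        images.append(np.asarray(img.resize(img_resize), dtype=np.float32) / 255)
    # Resulting shape: (file_count, *img_resize, 3), matching the checks above
    return np.stack(images), filenames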
Example #2
def test_data_preprocess(self):
    img_resize = (16, 16)
    color_channels = 3  # RGB
    train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file = data_helper.get_jpeg_data_files_paths()
    x_train, x_test, y_train, y_map, x_test_filename = data_helper.preprocess_data(
        train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file, img_resize)
    labels_df = pd.read_csv(train_csv_file)
    labels_count = len(set(chain.from_iterable([tags.split(" ") for tags in labels_df['tags'].values])))
    train_files_count = len(os.listdir(train_jpeg_dir))
    # The preprocessed test set concatenates both test folders
    test_files_count = len(os.listdir(test_jpeg_dir)) + len(os.listdir(test_jpeg_additional))
    assert x_train.shape == (train_files_count, *img_resize, color_channels)
    assert x_test.shape == (test_files_count, *img_resize, color_channels)
    assert y_train.shape == (train_files_count, labels_count)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import data_helper
#from keras_helper import AmazonKerasClassifier
from src.xception_classifier import XceptionClassifier

#img_resize = (64, 64) # The resize size of each image
img_resize = (74, 74)  # new input size required by the Xception model
validation_split_size = 0.2
epochs = 20
batch_size = 128

train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file = data_helper.get_jpeg_data_files_paths()
labels_df = pd.read_csv(train_csv_file)
labels_df.head()

# Each image can be tagged with multiple tags; let's list all the unique tags

# Print all unique tags
from itertools import chain
labels_list = list(chain.from_iterable([tags.split(" ") for tags in labels_df['tags'].values]))
labels_set = set(labels_list)
print("There are {} unique labels: {}".format(len(labels_set), labels_set))
Example #4
# 'df' below is assumed to be the labels dataframe loaded from the train CSV
all_tags = [
    item
    for sublist in list(df['tags'].apply(lambda row: row.split(" ")).values)
    for item in sublist
]
# Per-tag occurrence counts
data_dist = []
data_dist.append([
    x[1] for x in pd.DataFrame({
        'tag': all_tags
    }).groupby('tag').size().reset_index().values.tolist()
])
# Inverse-frequency class weights, normalized to sum to 1
data_weight = [1.0 / x for x in data_dist[0]]
den = sum(data_weight)
data_weight = [x / den for x in data_weight]
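
# Toy check of the inverse-frequency weighting above (illustrative values):
# a tag seen twice gets half the weight of a tag seen once, and the
# normalized weights sum to 1.
toy_counts = [1, 2, 2]                        # tag A once, tags B and C twice
toy_weight = [1.0 / c for c in toy_counts]    # [1.0, 0.5, 0.5]
den = sum(toy_weight)                         # 2.0
toy_weight = [w / den for w in toy_weight]    # [0.5, 0.25, 0.25]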

# Imports this snippet relies on (module locations assumed from the project layout)
from data_helper import get_jpeg_data_files_paths, AmazonPreprocessor
from keras.applications.vgg16 import VGG16
from keras.layers import Dense

train_jpeg_dir, test_jpeg_dir, test_jpeg_additional, train_csv_file = get_jpeg_data_files_paths()
dataset = AmazonPreprocessor(train_jpeg_dir,
                             train_csv_file,
                             test_jpeg_dir,
                             test_jpeg_additional,
                             img_resize=(224, 224))
dataset.init()

# Pre-trained VGG16 base (ImageNet weights), with global max pooling replacing the FC top
initial_model = VGG16(include_top=False,
                      weights='imagenet',
                      input_shape=(224, 224, 3),
                      pooling='max')
model_out = initial_model.output
model_out = Dense(256,
                  activation='relu',
                  input_shape=initial_model.output_shape[1:])(model_out)
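
# A plausible continuation (a sketch, not the original code): finish the head
# with one sigmoid unit per tag for multi-label prediction, freeze the
# convolutional base, and compile. The 17-unit output and the optimizer choice
# are assumptions, not taken from the original snippet.
from keras.models import Model

model_out = Dense(17, activation='sigmoid')(model_out)
model = Model(inputs=initial_model.input, outputs=model_out)

# Freeze the pre-trained VGG16 layers so only the new dense head trains at first
for layer in initial_model.layers:
    layer.trainable = False

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])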
import os

import data_helper
from keras_helper import AmazonKerasClassifier
from kaggle_data.downloader import KaggleDataDownloader

competition_name = "planet-understanding-the-amazon-from-space"

train, train_u = "train-jpg.tar.7z", "train-jpg.tar"
test, test_u = "test-jpg.tar.7z", "test-jpg.tar"
test_additional, test_additional_u = "test-jpg-additional.tar.7z", "test-jpg-additional.tar"
test_labels = "train_v2.csv.zip"
destination_path = "../input/"
is_datasets_present = False

# If the folders already exist then the files may already be extracted
# This is a bit hacky but it's sufficient for our needs
datasets_path = data_helper.get_jpeg_data_files_paths()
for dir_path in datasets_path:
    if os.path.exists(dir_path):
        is_datasets_present = True

if not is_datasets_present:
    # Put your Kaggle user name and password in the $KAGGLE_USER and $KAGGLE_PASSWD environment variables respectively
    downloader = KaggleDataDownloader(os.getenv("KAGGLE_USER"), os.getenv("KAGGLE_PASSWD"), competition_name)
    
    train_output_path = downloader.download_dataset(train, destination_path)
    downloader.decompress(train_output_path, destination_path) # Outputs a tar file
    downloader.decompress(destination_path + train_u, destination_path) # Extract the content of the previous tar file
    os.remove(train_output_path) # Removes the 7z file
    os.remove(destination_path + train_u) # Removes the tar file
    
    test_output_path = downloader.download_dataset(test, destination_path)
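    # The original snippet is cut off here; presumably the test archive is
    # handled the same way as the train archive above (this continuation is a sketch)
    downloader.decompress(test_output_path, destination_path)           # Outputs a tar file
    downloader.decompress(destination_path + test_u, destination_path)  # Extract the content of the tar file
    os.remove(test_output_path)
    os.remove(destination_path + test_u)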