def make_dev_set() -> None:
    """Carve a random dev split out of the training set, one class at a time.

    For every class directory under TRAIN_DIR, a DEV_RATIO fraction of its
    images is sampled at random and moved into a mirror directory under
    DEV_DIR.
    """
    make_dir_if_needed(DEV_DIR)
    for class_path in get_sub_dirs(TRAIN_DIR):
        class_name = class_path.split("/")[-1]
        # All image filenames currently held by this class.
        images = get_dir_files(class_path)
        n_images = len(images)
        n_dev = int(n_images * DEV_RATIO)
        print(
            f"The {class_path} class will have {n_dev}/{n_images} dev images")
        dev_class_path = os.path.join(DEV_DIR, class_name)
        make_dir_if_needed(dev_class_path)
        # Randomly pick n_dev images and relocate each one into the dev set.
        for fname in random.sample(images, n_dev):
            os.replace(
                os.path.join(class_path, fname),
                os.path.join(dev_class_path, fname),
            )
def report_dataset_stats(dataset_path: str, dataset_name: str) -> None:
    """Print a per-class image count for the dataset rooted at dataset_path."""
    stats = {}
    for class_dir in get_sub_dirs(dataset_path):
        # Map the class name (last path component) to its image count.
        stats[class_dir.split("/")[-1]] = len(get_dir_files(class_dir))
    print(f"{dataset_name} Set:")
    pprint(stats, indent=4)
def make_submission(model, image_size: int, index2class: dict) -> pd.DataFrame:
    """Run the model over the test set and build a submission DataFrame.

    Args:
        model: A trained Keras-style model exposing `.predict`.
        image_size: Side length in pixels; test images are loaded as
            (image_size, image_size).
        index2class: Mapping from predicted class index to class name.
            NOTE(review): looked up with `str(pred)`, so keys are assumed to
            be strings (e.g. a mapping round-tripped through JSON) — confirm
            against the caller.

    Returns:
        A DataFrame with columns "file" (sorted test filenames, matching the
        order `get_test` loads them in) and "species" (predicted class names).
    """
    test_X = get_test((image_size, image_size), "./test")
    test_probs = model.predict(test_X)
    # Highest-probability class index per test image.
    test_preds = test_probs.argmax(axis=-1)
    # Sorted to line up with the loading order used by get_test.
    test_fnames = sorted(get_dir_files("./test"))
    test_labels = [index2class[str(pred)] for pred in test_preds]
    return pd.DataFrame({"file": test_fnames, "species": test_labels})
def move_all_to_train() -> None:
    """ Moves all the dev images back to the train set. """
    for class_path in get_sub_dirs(DEV_DIR):
        class_name = class_path.split("/")[-1]
        train_class_path = os.path.join(TRAIN_DIR, class_name)
        # Relocate every dev image of this class into its train directory.
        for fname in get_dir_files(class_path):
            os.replace(
                os.path.join(class_path, fname),
                os.path.join(train_class_path, fname),
            )
def _from_raw(self, data_root_dir: str) -> None:
    """Load an image dataset from a directory of per-class subdirectories.

    Each subdirectory of `data_root_dir` is one class; every file inside it
    is loaded as an image resized to `self.img_size`. Populates:
      - self.class2index / self.index2class: name <-> integer index maps
        (index2class here has int keys)
      - self.classes: class names in load order
      - self.X: float array of images scaled into [0, 1]
      - self.y: one-hot label matrix
      - self.n_classes: number of classes

    Args:
        data_root_dir: Root directory whose subdirectories are the classes.
    """
    self.class2index = {}
    self.index2class = {}
    self.classes = []
    X, y = [], []
    print(f"Loading dataset at root path: '{data_root_dir}'...")
    for i, class_path in enumerate(tqdm(get_sub_dirs(data_root_dir))):
        class_name = class_path.split("/")[-1]
        self.class2index[class_name] = i
        self.index2class[i] = class_name
        self.classes.append(class_name)
        for image_fname in get_dir_files(class_path):
            image = load_img(os.path.join(class_path, image_fname),
                             target_size=self.img_size)
            X.append(img_to_array(image))
            y.append(i)
    self.X = np.array(X) / 255.0  # normalize pixel values into [0, 1]
    self.y = to_categorical(y)
    # Backward-compatible alias: the attribute was historically misspelled
    # "classses"; keep it pointing at the same list so old readers still work.
    self.classses = self.classes
    self.n_classes = len(self.classes)
def get_test(image_size: tuple, test_dir: str) -> np.ndarray:
    """Load the test images as a normalized array, with an HDF5 disk cache.

    On the first call the images in `test_dir` are loaded (in sorted filename
    order), scaled into [0, 1], and cached to an .h5 file; later calls read
    the cache directly.

    Args:
        image_size: (height, width) tuple passed to `load_img` as target_size.
            NOTE: the cache filename embeds the tuple's repr, e.g.
            "test-(224, 224)px.h5" — kept as-is for cache compatibility.
        test_dir: Directory containing the test images.

    Returns:
        A float ndarray of shape (n_images, height, width, channels).
    """
    h5path = f"test-{image_size}px.h5"
    if os.path.isfile(h5path):
        # Cache hit: use a context manager so the file handle is closed
        # (the original leaked it in this branch).
        with h5py.File(h5path, "r") as h5f:
            X = h5f["X"][:]
    else:
        images = []
        for image_fname in sorted(get_dir_files(test_dir)):
            image = load_img(os.path.join(test_dir, image_fname),
                             target_size=image_size)
            images.append(img_to_array(image))
        X = np.array(images) / 255.0  # normalize the scale
        with h5py.File(h5path, "w") as h5f:
            h5f.create_dataset("X", data=X)
    return X