def make_dev_set() -> None:
    """Carve a dev split out of the train set.

    For every class folder under TRAIN_DIR, randomly sample a DEV_RATIO
    fraction of its images and move them into a mirrored class folder
    under DEV_DIR.
    """
    make_dir_if_needed(DEV_DIR)

    for class_path in get_sub_dirs(TRAIN_DIR):
        # The class name is the last path component of the class folder.
        class_name = class_path.split("/")[-1]

        all_images = get_dir_files(class_path)
        total = len(all_images)
        sample_size = int(total * DEV_RATIO)
        print(
            f"The {class_path} class will have {sample_size}/{total} dev images")

        # Pick the dev subset at random and make sure its folder exists.
        chosen = random.sample(all_images, sample_size)
        target_dir = os.path.join(DEV_DIR, class_name)
        make_dir_if_needed(target_dir)

        for fname in chosen:
            # Relocate the sampled image from the train to the dev tree.
            os.replace(os.path.join(class_path, fname),
                       os.path.join(target_dir, fname))
def report_dataset_stats(dataset_path: str, dataset_name: str) -> None:
    """Print a per-class image count for the dataset rooted at *dataset_path*.

    The report maps each class folder name (last path component) to the
    number of files it contains.
    """
    counts = {}
    for class_dir in get_sub_dirs(dataset_path):
        counts[class_dir.split("/")[-1]] = len(get_dir_files(class_dir))
    print(f"{dataset_name} Set:")
    pprint(counts, indent=4)
# Example #3 (score: 0)
def make_submission(model, image_size: int, index2class: dict) -> pd.DataFrame:
    """Build the submission DataFrame for the test images.

    Runs *model* on the (square, `image_size`-pixel) test images, maps each
    argmax prediction through *index2class* (keyed by stringified index),
    and pairs the labels with the sorted test file names.
    """
    features = get_test((image_size, image_size), "./test")
    probabilities = model.predict(features)
    predictions = probabilities.argmax(axis=-1)
    # Sorted file names match the row order produced by get_test.
    filenames = sorted(get_dir_files("./test"))
    labels = [index2class[str(pred)] for pred in predictions]
    return pd.DataFrame({"file": filenames, "species": labels})
def move_all_to_train() -> None:
    """
    Moves all the dev images back to the train set.

    Mirrors each class folder under DEV_DIR onto the matching class
    folder under TRAIN_DIR, relocating every file it finds.
    """
    for class_path in get_sub_dirs(DEV_DIR):
        # Class name is the last path component of the dev class folder.
        class_name = class_path.split("/")[-1]
        destination = os.path.join(TRAIN_DIR, class_name)

        for fname in get_dir_files(class_path):
            # Return this image to its train-set class folder.
            os.replace(os.path.join(class_path, fname),
                       os.path.join(destination, fname))
    def _from_raw(self, data_root_dir: str) -> None:
        """Load the dataset from class-per-folder raw images.

        Walks each class folder under *data_root_dir*, building the
        class<->index mappings, the pixel array `self.X` (scaled to
        [0, 1]) and the one-hot label matrix `self.y`.

        NOTE(review): the attribute name `classses` (triple 's') looks
        like a typo, but it is preserved because code outside this view
        may read it.
        """
        self.class2index = {}
        self.index2class = {}
        self.classses = []
        samples, labels = [], []

        print(f"Loading dataset at root path: '{data_root_dir}'...")
        for index, class_dir in enumerate(tqdm(get_sub_dirs(data_root_dir))):
            name = class_dir.split("/")[-1]
            self.class2index[name] = index
            self.index2class[index] = name
            self.classses.append(name)

            for fname in get_dir_files(class_dir):
                # Resize on load so every sample shares self.img_size.
                img = load_img(os.path.join(class_dir, fname),
                               target_size=self.img_size)
                samples.append(img_to_array(img))
                labels.append(index)

        self.X = np.array(samples) / 255.0  # normalize the scale
        self.y = to_categorical(labels)
        self.n_classes = len(self.classses)
# Example #6 (score: 0)
def get_test(image_size: tuple, test_dir: str) -> np.ndarray:
    """Load the test images as a normalized float array, caching to HDF5.

    On the first call the images in *test_dir* are read (resized to
    *image_size*), scaled into [0, 1], and written to an .h5 cache file;
    later calls read the cached array directly.

    Fixes over the original: the return annotation was `-> None` although
    the function returns the array, and the read-path file handle was
    never closed (resource leak) — both branches now use `with`.

    :param image_size: (height, width) tuple passed to `load_img`.
    :param test_dir: directory containing the raw test images.
    :return: array of shape (n_images, *image_size, channels).
    """
    # NOTE: image_size is a tuple, so the cache name embeds its repr,
    # e.g. "test-(224, 224)px.h5" — kept as-is for cache compatibility.
    h5path = f"test-{image_size}px.h5"

    if os.path.isfile(h5path):
        # Cache hit: slurp the whole dataset, closing the file afterwards.
        with h5py.File(h5path, "r") as h5f:
            X = h5f["X"][:]
    else:
        images = []

        # Sorted order keeps rows aligned with sorted(get_dir_files(...))
        # used by callers such as make_submission.
        for image_fname in sorted(get_dir_files(test_dir)):
            image = load_img(os.path.join(test_dir, image_fname),
                             target_size=image_size)
            images.append(img_to_array(image))

        X = np.array(images) / 255.0  # normalize the scale

        with h5py.File(h5path, "w") as h5f:
            h5f.create_dataset("X", data=X)

    return X