import glob
import os
import string


def copy_divisions(divisions, src_dir: str, dest_dir: str):
    # label each division A, B, C, ... and copy every class folder it contains
    for div, div_name in zip(divisions, string.ascii_uppercase):
        for cl in div:
            cl_src = os.path.join(src_dir, cl)
            cl_dest = os.path.join(dest_dir, div_name, cl)
            cl_data = glob.glob(os.path.join(cl_src, '*'))
            transfer_datapoints(cl_dest, cl_src, cl_data)
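
Every example in this listing calls a transfer_datapoints helper that is not shown here, and its signature varies slightly between snippets (most pass dest_dir, src_dir and a list of files; Example #2 also passes a phase and a class name). A minimal sketch of the common three-argument form, assuming the helper simply copies each file to the same relative location under the destination directory:

import os
import shutil


def transfer_datapoints(dest_dir, src_dir, data_points):
    # hypothetical stand-in, not part of the original listing: copy each
    # data point under src_dir to the matching relative path in dest_dir
    for dp in data_points:
        rel_path = os.path.relpath(dp, src_dir)
        dest_path = os.path.join(dest_dir, rel_path)
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
        shutil.copy2(dp, dest_path)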
Example #2
    def process_dataset(self, raw_dataset_dir, dataset_name):
        class_filter = os.path.join(raw_dataset_dir, self.class_name_filter)
        class_list = glob.glob(class_filter)
        num_classes_to_use = self.max_num_classes

        filtered_dataset_output = os.path.join(self.output_dataset_dir,
                                               dataset_name)

        assert len(class_list) >= self.min_num_classes
        if self.max_num_classes > len(class_list) or self.max_num_classes == 0:
            num_classes_to_use = len(class_list)

        min_data_point = sum(self.phase_size_dict.values())

        for i in range(num_classes_to_use):
            class_name = os.path.basename(class_list[i])
            class_dir_path = class_list[i]
            data_points = glob.glob(
                os.path.join(class_dir_path, self.data_name_filter))
            num_datapoints = len(data_points)

            if num_datapoints >= min_data_point:
                reduced_data = data_points
                # for each phase, choose a fixed number of data points for every
                # class, then remove the chosen points from the pool so the next
                # phase cannot reuse them
                for phase in self.phase_size_dict.keys():
                    phase_data = np.random.choice(reduced_data,
                                                  self.phase_size_dict[phase],
                                                  replace=False)
                    transfer_datapoints(filtered_dataset_output, phase,
                                        class_name, phase_data)
                    reduced_data = np.setdiff1d(reduced_data, phase_data)

        return filtered_dataset_output
Example #3
    def process_dataset(self, raw_dataset_dir, dataset_name):
        num_classes = 0
        class_dirs = glob.glob(os.path.join(raw_dataset_dir, '*'))

        dest_ds_dir = os.path.join(self.__dest_root_dir, dataset_name + ' min_size=' + str(self.__min_size))

        if os.path.exists(dest_ds_dir):
            return dest_ds_dir, len(glob.glob(os.path.join(dest_ds_dir, '*')))
        for class_dir in tqdm.tqdm(class_dirs, desc='select classes by size'):
            datapoints = glob.glob(os.path.join(class_dir, '*'))
            if len(datapoints) >= self.__min_size:
                num_classes += 1
                cl = os.path.basename(class_dir)
                dest_cl_dir = os.path.join(dest_ds_dir, cl)
                transfer_datapoints(dest_cl_dir, class_dir, datapoints)

        return dest_ds_dir, num_classes
Example #4
    def process_dataset(self, raw_dataset_dir, dataset_name):
        num_classes = 0
        class_dirs = glob.glob(os.path.join(raw_dataset_dir, 'train', '*'))
        dest_ds_dir = os.path.join(self.__output_dataset_dir, dataset_name + ' white_list')

        if os.path.exists(dest_ds_dir):
            return dest_ds_dir, len(glob.glob(os.path.join(dest_ds_dir, '*')))

        for class_dir in tqdm.tqdm(class_dirs, desc='white list'):
            cl = os.path.basename(class_dir)
            if cl in self.__white_list:
                train_datapoints = glob.glob(os.path.join(class_dir, '*'))
                num_classes += 1
                dest_cl_dir = os.path.join(dest_ds_dir, cl)
                transfer_datapoints(dest_cl_dir, class_dir, train_datapoints)

        return dest_ds_dir, num_classes
Example #5
    def process_dataset(self, raw_dataset_dir, dataset_name):
        print('Running phase percentage size filter for:')
        print(f'dir: {raw_dataset_dir}')
        class_filter = os.path.join(raw_dataset_dir, self.class_name_filter)
        class_list = glob.glob(class_filter)

        filtered_dataset_output = self.__get_output_path(dataset_name)

        if os.path.exists(filtered_dataset_output):
            return filtered_dataset_output, len(
                glob.glob(
                    os.path.join(filtered_dataset_output,
                                 list(self.phase_percentage_dict.keys())[0],
                                 self.class_name_filter)))

        max_classes = len(class_list)

        for i in range(max_classes):
            class_name = os.path.basename(class_list[i])
            class_dir_path = class_list[i]
            data_points = glob.glob(
                os.path.join(class_dir_path, self.data_name_filter))
            num_datapoints = len(data_points)
            reduced_data = data_points
            # for each phase, take a fixed percentage of the class's data points,
            # then remove the chosen points from the pool so the next phase
            # cannot reuse them
            for phase in self.phase_percentage_dict.keys():
                phase_perc = self.phase_percentage_dict[phase]
                phase_size = int(phase_perc * num_datapoints)
                phase_data = np.random.choice(reduced_data,
                                              phase_size,
                                              replace=False)
                dest_dir = os.path.join(filtered_dataset_output, phase)
                os.makedirs(os.path.dirname(dest_dir), exist_ok=True)
                transfer_datapoints(dest_dir, raw_dataset_dir, phase_data)
                reduced_data = np.setdiff1d(reduced_data, phase_data)

        num_classes_to_use = len(
            glob.glob(
                os.path.join(filtered_dataset_output,
                             list(self.phase_percentage_dict.keys())[0],
                             self.class_name_filter)))

        return filtered_dataset_output, num_classes_to_use
Example #6
    def process_dataset(self, raw_dataset_dir, dataset_name):
        class_filter = os.path.join(raw_dataset_dir, self.class_name_filter)
        class_list = glob.glob(class_filter)

        filtered_dataset_output = self.__get_output_path(dataset_name)

        if os.path.exists(filtered_dataset_output):
            return filtered_dataset_output, len(
                glob.glob(
                    os.path.join(filtered_dataset_output,
                                 list(self.phase_size_dict.keys())[0],
                                 self.class_name_filter)))

        max_classes = len(class_list)

        min_data_point = sum(self.phase_size_dict.values())

        num_classes_to_use = 0

        for i in tqdm(range(max_classes), desc='phase size filter'):
            class_name = os.path.basename(class_list[i])
            class_dir_path = class_list[i]
            data_points = glob.glob(
                os.path.join(class_dir_path, self.data_name_filter))

            num_datapoints = len(data_points)

            if num_datapoints >= min_data_point:
                num_classes_to_use += 1
                reduced_data = data_points
                # for each phase, choose a fixed number of data points for every
                # class, then remove the chosen points from the pool so the next
                # phase cannot reuse them
                for phase in self.phase_size_dict.keys():
                    phase_data = np.random.choice(reduced_data,
                                                  self.phase_size_dict[phase],
                                                  replace=False)
                    dest_dir = os.path.join(filtered_dataset_output, phase)
                    os.makedirs(os.path.dirname(dest_dir), exist_ok=True)
                    transfer_datapoints(dest_dir, raw_dataset_dir, phase_data)
                    reduced_data = np.setdiff1d(reduced_data, phase_data)

        return filtered_dataset_output, num_classes_to_use
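
The size-based and percentage-based phase filters above keep their phases disjoint by sampling without replacement and then shrinking the candidate pool with np.setdiff1d. A standalone illustration of that pattern, with made-up file names and phase sizes:

import numpy as np

phase_size_dict = {'train': 3, 'val': 2}  # illustrative sizes only
data_points = np.array([f'img_{i}.jpg' for i in range(6)])

reduced_data = data_points
for phase, size in phase_size_dict.items():
    phase_data = np.random.choice(reduced_data, size, replace=False)
    reduced_data = np.setdiff1d(reduced_data, phase_data)
    print(phase, sorted(phase_data))
# 'train' and 'val' never share a file; one of the six files is left unused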
Example #7
    def process_dataset(self, raw_dataset_dir, dataset_name):
        class_list = self.__select_classes(raw_dataset_dir)

        filtered_dataset_output = self.__get_output_path(dataset_name)

        if os.path.exists(filtered_dataset_output):
            return filtered_dataset_output, len(
                glob.glob(
                    os.path.join(filtered_dataset_output,
                                 list(self.phase_size_dict.keys())[0],
                                 self.class_name_filter)))

        num_classes_to_use = len(class_list)  # every selected class is transferred
        for phase in self.phase_size_dict:
            for cl in class_list:
                data_points = glob.glob(
                    os.path.join(raw_dataset_dir, phase, cl,
                                 self.data_name_filter))
                dest_dir = os.path.join(filtered_dataset_output, phase)
                transfer_datapoints(dest_dir, raw_dataset_dir, data_points)

        return filtered_dataset_output, num_classes_to_use
Example #8
    def process_dataset(self, raw_dataset_dir, dataset_name):
        num_classes = 0
        class_dirs = glob.glob(os.path.join(raw_dataset_dir, '*'))
        dest_ds_dir = os.path.join(self.__output_dataset_dir,
                                   dataset_name + ' exclude_from_list')

        if os.path.exists(dest_ds_dir):
            return dest_ds_dir, len(glob.glob(os.path.join(dest_ds_dir, '*')))

        for class_dir in tqdm.tqdm(class_dirs, desc='exclude classes by list'):
            # strip the leading 'n' and any leading zeros from the class id,
            # e.g. 'n01440764' -> '1440764'
            cl = str(int(os.path.basename(class_dir).replace('n', '')))
            if cl not in self.__exclusions:
                datapoints = glob.glob(os.path.join(class_dir, '*'))
                num_classes += 1
                cl = os.path.basename(class_dir)
                dest_cl_dir = os.path.join(dest_ds_dir, cl)
                transfer_datapoints(dest_cl_dir, class_dir, datapoints)
            else:
                print('found ', cl, ' in exclusions')

        return dest_ds_dir, num_classes
Example #9
    def process_dataset(self, raw_dataset_dir, dataset_name):
        dir_depth = (self.__depth * ['*'])
        class_filter = os.path.join(raw_dataset_dir, *dir_depth,
                                    self.__class_name_filter)
        classes_paths = np.array(glob.glob(class_filter))

        classes = [os.path.basename(cl) for cl in classes_paths]
        class_names = np.unique(classes)
        # print(class_names)
        num_classes_to_use = self.__max_num_classes

        assert len(class_names) >= self.__min_num_classes
        if self.__max_num_classes > len(
                class_names) or self.__max_num_classes == 0:
            num_classes_to_use = class_names.shape[0]

        classes_to_use = np.random.choice(class_names,
                                          num_classes_to_use,
                                          replace=False)

        filtered_dataset_output = os.path.join(
            self.__output_dataset_dir,
            f'{dataset_name}_num-classes_{num_classes_to_use}')

        if not os.path.exists(filtered_dataset_output):
            for i in tqdm.tqdm(range(num_classes_to_use),
                               desc='num class filter'):
                class_dir_paths = glob.glob(
                    os.path.join(raw_dataset_dir, *dir_depth,
                                 classes_to_use[i]))
                for class_path in class_dir_paths:
                    data_points = glob.glob(
                        os.path.join(class_path, self.__data_name_filter))

                    transfer_datapoints(filtered_dataset_output,
                                        raw_dataset_dir, data_points)

        return filtered_dataset_output, num_classes_to_use
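
The dir_depth trick above builds a glob pattern with one '*' segment per directory level, so class folders can be matched at a fixed depth anywhere below the dataset root. A small illustration (the root path and the 'n*' class-name filter are made up):

import os

depth = 2  # e.g. <root>/<phase>/<chunk>/<class>
pattern = os.path.join('raw_ds', *(depth * ['*']), 'n*')
print(pattern)  # raw_ds/*/*/n* on POSIX; matches e.g. raw_ds/train/part0/n01440764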
Example #10
    class_names = [os.path.basename(cl) for cl in train_classes]
    num_classes = 0
    min_val_datapoints = 0  # 0 disables the val-size check below

    for cl in tqdm.tqdm(class_names, desc='classes'):
        # Checking we have enough train
        train_cl_source = os.path.join(existing_ds_root, 'train', cl)
        train_data_points = glob.glob(os.path.join(train_cl_source, '*'))
        if len(train_data_points) >= min_train_datapoints:
            # Checking we have enough val
            val_cl_source = os.path.join(existing_ds_root, 'val', cl)
            val_data_points = glob.glob(os.path.join(val_cl_source, '*'))
            if len(val_data_points) >= min_val_datapoints:
                num_classes += 1
                # transferring class train ds
                train_dest = os.path.join(dest_root, 'train', cl)
                os.makedirs(train_dest)
                transfer_datapoints(train_dest, train_cl_source,
                                    train_data_points)

                # transferring class val ds
                val_dest = os.path.join(dest_root, 'val', cl)
                os.makedirs(val_dest)
                transfer_datapoints(val_dest, val_cl_source, val_data_points)

                if num_classes == max_classes:
                    break

    print(f'Moved {num_classes} classes to {dest_root}')
    print('Done.')