def copy_divisions(divisions, src_dir: str, dest_dir: str):
    """Copy every class of every division into a lettered destination tree.

    Divisions are paired with capital letters (first division -> 'A',
    second -> 'B', ...); each class directory ``src_dir/<class>`` is copied
    to ``dest_dir/<letter>/<class>`` via ``transfer_datapoints``.
    """
    for div_name, division in zip(string.ascii_uppercase, divisions):
        for cl in division:
            src_cl_dir = os.path.join(src_dir, cl)
            dst_cl_dir = os.path.join(dest_dir, div_name, cl)
            datapoints = glob.glob(os.path.join(src_cl_dir, '*'))
            transfer_datapoints(dst_cl_dir, src_cl_dir, datapoints)
def process_dataset(self, raw_dataset_dir, dataset_name):
    """Split each sufficiently large class into disjoint fixed-size phase subsets.

    For every class directory matching ``self.class_name_filter`` under
    ``raw_dataset_dir`` that has at least ``sum(self.phase_size_dict.values())``
    datapoints, draw ``self.phase_size_dict[phase]`` random datapoints per
    phase (without replacement, and without reusing datapoints across phases)
    and hand them to ``transfer_datapoints``.

    Returns the output dataset directory path.
    Raises AssertionError when fewer than ``self.min_num_classes`` classes exist.
    """
    class_filter = os.path.join(raw_dataset_dir, self.class_name_filter)
    class_list = glob.glob(class_filter)
    num_classes_to_use = self.max_num_classes
    filtered_dataset_output = os.path.join(self.output_dataset_dir,
                                           dataset_name)
    assert len(class_list) >= self.min_num_classes
    # max_num_classes == 0 means "use all classes"; also clamp to what exists.
    if self.max_num_classes > len(class_list) or self.max_num_classes == 0:
        num_classes_to_use = len(class_list)
    # A class must have at least enough datapoints to fill every phase.
    min_data_point = sum(self.phase_size_dict.values())
    for i in range(num_classes_to_use):
        class_name = os.path.basename(class_list[i])
        class_dir_path = class_list[i]
        data_points = glob.glob(
            os.path.join(class_dir_path, self.data_name_filter))
        num_datapoints = len(data_points)
        if num_datapoints >= min_data_point:
            reduced_data = data_points
            # For each phase we choose a specific amount of datapoints for
            # every id, then remove the datapoints we used before selecting
            # for the next phase.
            for phase in self.phase_size_dict.keys():
                # BUG FIX: sample from reduced_data (not data_points) so a
                # datapoint can never be assigned to more than one phase.
                phase_data = np.random.choice(reduced_data,
                                              self.phase_size_dict[phase],
                                              replace=False)
                # NOTE(review): this call site passes 4 args while sibling
                # implementations pass 3 — confirm transfer_datapoints'
                # signature supports this form.
                transfer_datapoints(filtered_dataset_output, phase,
                                    class_name, phase_data)
                reduced_data = np.setdiff1d(reduced_data, phase_data)
    return filtered_dataset_output
def process_dataset(self, raw_dataset_dir, dataset_name):
    """Keep only the classes that have at least ``self.__min_size`` datapoints.

    Copies every qualifying class directory from ``raw_dataset_dir`` into a
    destination directory named after the dataset and the size threshold.

    Returns ``(dest_dataset_dir, num_classes_kept)``. If the destination
    already exists the transfer is skipped and the existing class count is
    returned.
    """
    num_classes = 0
    class_dirs = glob.glob(os.path.join(raw_dataset_dir, '*'))
    dest_ds_dir = os.path.join(
        self.__dest_root_dir,
        dataset_name + ' min_size=' + str(self.__min_size))
    if os.path.exists(dest_ds_dir):
        # BUG FIX: previously returned the glob result (a list of paths)
        # while the normal path returns an int count — return a count in
        # both cases so callers get a consistent type.
        return dest_ds_dir, len(glob.glob(os.path.join(dest_ds_dir, '*')))
    for dir in tqdm.tqdm(class_dirs, desc='select classes by size'):
        datapoints = glob.glob(os.path.join(dir, '*'))
        if len(datapoints) >= self.__min_size:
            num_classes += 1
            cl = os.path.basename(dir)
            dest_cl_dir = os.path.join(dest_ds_dir, cl)
            transfer_datapoints(dest_cl_dir, dir, datapoints)
    return dest_ds_dir, num_classes
def process_dataset(self, raw_dataset_dir, dataset_name):
    """Keep only the train classes whose names appear in ``self.__white_list``.

    Scans ``raw_dataset_dir/train/*`` and copies each white-listed class
    directory into ``<output>/<dataset_name> white_list``.

    Returns ``(dest_dataset_dir, num_classes_kept)``. If the destination
    already exists the transfer is skipped and the existing class count is
    returned.
    """
    num_classes = 0
    class_dirs = glob.glob(os.path.join(raw_dataset_dir, 'train', '*'))
    dest_ds_dir = os.path.join(self.__output_dataset_dir,
                               dataset_name + ' white_list')
    if os.path.exists(dest_ds_dir):
        # BUG FIX: the glob pattern was '*"' (stray trailing quote), which
        # matches nothing; also return a count (int) to match the normal
        # return path instead of a list of paths.
        return dest_ds_dir, len(glob.glob(os.path.join(dest_ds_dir, '*')))
    for dir in tqdm.tqdm(class_dirs, desc='white list'):
        cl = os.path.basename(dir)
        if cl in self.__white_list:
            train_datapoints = glob.glob(os.path.join(dir, '*'))
            num_classes += 1
            # (removed a redundant re-computation of cl here)
            dest_cl_dir = os.path.join(dest_ds_dir, cl)
            transfer_datapoints(dest_cl_dir, dir, train_datapoints)
    return dest_ds_dir, num_classes
def process_dataset(self, raw_dataset_dir, dataset_name):
    """Partition every class into phases by percentage of its datapoints.

    For each class matching ``self.class_name_filter``, each phase in
    ``self.phase_percentage_dict`` receives ``int(percentage * total)``
    randomly chosen datapoints; chosen datapoints are removed from the pool
    so later phases never reuse them.

    Returns ``(output_dataset_dir, num_classes_in_first_phase)``. If the
    output directory already exists, transfers are skipped and the existing
    class count is returned.
    """
    print('Running phase percentage size filter for:')
    print(f'dir: {raw_dataset_dir}')
    class_paths = glob.glob(
        os.path.join(raw_dataset_dir, self.class_name_filter))
    output_dir = self.__get_output_path(dataset_name)
    if os.path.exists(output_dir):
        existing = glob.glob(
            os.path.join(output_dir,
                         list(self.phase_percentage_dict.keys())[0],
                         self.class_name_filter))
        return output_dir, len(existing)
    for class_dir in class_paths:
        data_points = glob.glob(
            os.path.join(class_dir, self.data_name_filter))
        total = len(data_points)
        remaining = data_points
        # Each phase draws its share at random; the drawn files are removed
        # from the pool before the next phase selects.
        for phase, fraction in self.phase_percentage_dict.items():
            chosen = np.random.choice(remaining,
                                      int(fraction * total),
                                      replace=False)
            phase_dir = os.path.join(output_dir, phase)
            os.makedirs(os.path.dirname(phase_dir), exist_ok=True)
            transfer_datapoints(phase_dir, raw_dataset_dir, chosen)
            remaining = np.setdiff1d(remaining, chosen)
    num_classes_to_use = len(
        glob.glob(
            os.path.join(output_dir,
                         list(self.phase_percentage_dict.keys())[0],
                         self.class_name_filter)))
    return output_dir, num_classes_to_use
def process_dataset(self, raw_dataset_dir, dataset_name):
    """Split each sufficiently large class into disjoint fixed-size phase subsets.

    Classes with fewer than ``sum(self.phase_size_dict.values())`` datapoints
    are skipped; for the rest, each phase receives
    ``self.phase_size_dict[phase]`` randomly chosen datapoints, never reusing
    a datapoint across phases.

    Returns ``(output_dataset_dir, num_classes_used)``. If the output
    directory already exists, transfers are skipped and the existing class
    count is returned.
    """
    class_filter = os.path.join(raw_dataset_dir, self.class_name_filter)
    class_list = glob.glob(class_filter)
    filtered_dataset_output = self.__get_output_path(dataset_name)
    if os.path.exists(filtered_dataset_output):
        # BUG FIX: previously returned the glob result (a list) while the
        # normal path returns an int count — return a count in both cases.
        return filtered_dataset_output, len(
            glob.glob(
                os.path.join(filtered_dataset_output,
                             list(self.phase_size_dict.keys())[0],
                             self.class_name_filter)))
    max_classes = len(class_list)
    # A class must have enough datapoints to fill every phase.
    min_data_point = sum(self.phase_size_dict.values())
    num_classes_to_use = 0
    for i in tqdm(range(max_classes), desc='phase size filter'):
        class_name = os.path.basename(class_list[i])
        class_dir_path = class_list[i]
        data_points = glob.glob(
            os.path.join(class_dir_path, self.data_name_filter))
        num_datapoints = len(data_points)
        if num_datapoints >= min_data_point:
            num_classes_to_use += 1
            reduced_data = data_points
            # For each phase we choose a specific amount of datapoints,
            # then remove the used datapoints before the next phase selects.
            for phase in self.phase_size_dict.keys():
                phase_data = np.random.choice(reduced_data,
                                              self.phase_size_dict[phase],
                                              replace=False)
                dest_dir = os.path.join(filtered_dataset_output, phase)
                os.makedirs(os.path.dirname(dest_dir), exist_ok=True)
                transfer_datapoints(dest_dir, raw_dataset_dir, phase_data)
                reduced_data = np.setdiff1d(reduced_data, phase_data)
    return filtered_dataset_output, num_classes_to_use
def process_dataset(self, raw_dataset_dir, dataset_name):
    """Copy the phase sub-folders of the selected classes to the output dir.

    For every phase in ``self.phase_size_dict`` and every class chosen by
    ``self.__select_classes``, all datapoints under
    ``raw_dataset_dir/<phase>/<class>`` are transferred to the corresponding
    phase folder of the output dataset.

    Returns ``(output_dataset_dir, num_classes_used)``. If the output
    directory already exists, transfers are skipped and the existing class
    count is returned.
    """
    class_list = self.__select_classes(raw_dataset_dir)
    filtered_dataset_output = self.__get_output_path(dataset_name)
    if os.path.exists(filtered_dataset_output):
        # Return a count here too, consistent with the normal return path.
        return filtered_dataset_output, len(
            glob.glob(
                os.path.join(filtered_dataset_output,
                             list(self.phase_size_dict.keys())[0],
                             self.class_name_filter)))
    # BUG FIX: this was initialised to 0 and never incremented, so the
    # function always reported zero classes; every class in class_list is
    # transferred, so the correct count is len(class_list).
    num_classes_to_use = len(class_list)
    for phase in self.phase_size_dict:
        for cl in class_list:
            data_points = glob.glob(
                os.path.join(raw_dataset_dir, phase, cl,
                             self.data_name_filter))
            dest_dir = os.path.join(filtered_dataset_output, phase)
            transfer_datapoints(dest_dir, raw_dataset_dir, data_points)
    return filtered_dataset_output, num_classes_to_use
def process_dataset(self, raw_dataset_dir, dataset_name):
    """Copy every class whose numeric id is NOT in ``self.__exclusions``.

    Class ids are derived from the directory name by stripping the leading
    'n' prefix and dropping leading zeros (e.g. 'n01440764' -> '1440764').
    Non-excluded class directories are copied into
    ``<output>/<dataset_name> exclude_from_list``.

    Returns ``(dest_dataset_dir, num_classes_kept)``. If the destination
    already exists the transfer is skipped and the existing class count is
    returned.
    """
    num_classes = 0
    class_dirs = glob.glob(os.path.join(raw_dataset_dir, '*'))
    dest_ds_dir = os.path.join(self.__output_dataset_dir,
                               dataset_name + ' exclude_from_list')
    if os.path.exists(dest_ds_dir):
        # BUG FIX: the glob pattern was '*"' (stray trailing quote), which
        # matches nothing; also return a count (int) to match the normal
        # return path instead of a list of paths.
        return dest_ds_dir, len(glob.glob(os.path.join(dest_ds_dir, '*')))
    # BUG FIX: corrected typo 'excldue' in the progress-bar label.
    for dir in tqdm.tqdm(class_dirs, desc='exclude classes by list'):
        cl = str(int(os.path.basename(dir).replace('n', '')))
        if cl not in self.__exclusions:
            datapoints = glob.glob(os.path.join(dir, '*'))
            num_classes += 1
            # Destination keeps the original directory name, not the id.
            cl_name = os.path.basename(dir)
            dest_cl_dir = os.path.join(dest_ds_dir, cl_name)
            transfer_datapoints(dest_cl_dir, dir, datapoints)
        else:
            # The original 'elif cl in self.__exclusions' was exhaustive;
            # a plain else is equivalent.
            print('found ', cl, ' in exclusions')
    return dest_ds_dir, num_classes
def process_dataset(self, raw_dataset_dir, dataset_name):
    """Randomly keep a bounded number of classes from a nested dataset tree.

    Class directories are found ``self.__depth`` levels below
    ``raw_dataset_dir``; ``self.__max_num_classes`` of the unique class names
    are chosen at random (0 means "all") and every matching directory's
    datapoints are transferred into the output dataset.

    Returns ``(output_dataset_dir, num_classes_used)``; transfers are
    skipped when the output directory already exists.
    Raises AssertionError when fewer than ``self.__min_num_classes`` unique
    classes are found.
    """
    wildcards = self.__depth * ['*']
    class_filter = os.path.join(raw_dataset_dir, *wildcards,
                                self.__class_name_filter)
    classes_paths = np.array(glob.glob(class_filter))
    # Class folders may appear under several parents; deduplicate by name.
    class_names = np.unique([os.path.basename(p) for p in classes_paths])
    num_classes_to_use = self.__max_num_classes
    assert len(class_names) >= self.__min_num_classes
    # 0 means "use everything"; also clamp to the number that exists.
    if self.__max_num_classes == 0 or self.__max_num_classes > len(class_names):
        num_classes_to_use = class_names.shape[0]
    classes_to_use = np.random.choice(class_names,
                                      num_classes_to_use,
                                      replace=False)
    filtered_dataset_output = os.path.join(
        self.__output_dataset_dir,
        f'{dataset_name}_num-classes_{num_classes_to_use}')
    if not os.path.exists(filtered_dataset_output):
        for i in tqdm.tqdm(range(num_classes_to_use),
                           desc='num class filter'):
            matching_dirs = glob.glob(
                os.path.join(raw_dataset_dir, *wildcards, classes_to_use[i]))
            for class_path in matching_dirs:
                data_points = glob.glob(
                    os.path.join(class_path, self.__data_name_filter))
                transfer_datapoints(filtered_dataset_output,
                                    raw_dataset_dir, data_points)
    return filtered_dataset_output, num_classes_to_use
# Script fragment: copy up to max_classes classes that have enough train and
# val datapoints from existing_ds_root into dest_root.
# Relies on names defined earlier in the script: train_classes,
# existing_ds_root, min_train_datapoints, dest_root, max_classes.
class_names = [os.path.basename(cl) for cl in train_classes]
num_classes = 0
# NOTE(review): 0 makes the val-size check below always pass — confirm
# whether this was meant to be configurable like min_train_datapoints.
min_val_datapoints = 0
for cl in tqdm.tqdm(class_names, desc='classes'):
    # Checking we have enough train
    train_cl_source = os.path.join(existing_ds_root, 'train', cl)
    train_data_points = glob.glob(os.path.join(train_cl_source, '*'))
    if len(train_data_points) >= min_train_datapoints:
        # Checking we have enough val
        val_cl_source = os.path.join(existing_ds_root, 'val', cl)
        val_data_points = glob.glob(os.path.join(val_cl_source, '*'))
        if len(val_data_points) >= min_val_datapoints:
            num_classes += 1
            # transferring class train ds
            train_dest = os.path.join(dest_root, 'train', cl)
            os.makedirs(train_dest)
            transfer_datapoints(train_dest, train_cl_source,
                                train_data_points)
            # transferring class val ds
            val_dest = os.path.join(dest_root, 'val', cl)
            os.makedirs(val_dest)
            transfer_datapoints(val_dest, val_cl_source, val_data_points)
            # Stop once the requested number of classes has been copied.
            if num_classes == max_classes:
                break
print('Moved ', num_classes, ' to ', dest_root)
print('Done.')