import json
import typing as tp

import pandas as pd

# `utils` is assumed to be catalyst.utils, which provides the create_dataset,
# create_dataframe, get_dataset_labeling, map_dataframe and
# split_dataframe_train_test helpers used below.
from catalyst import utils


def _prepare_df_from_dirs(in_dirs, tag_column_name, recursive: bool = False):
    dfs = []
    splitted_dirs = in_dirs.strip(",").split(",")

    def process_fn(x):
        # `in_dir` is the loop variable below; process_fn is only called while
        # the corresponding iteration is active
        if len(splitted_dirs) == 1:
            # single input dir: strip the whole in_dir prefix from the path
            return x.replace(f"{in_dir}", "")
        else:
            # several input dirs: keep only the last part of the in_dir path,
            # which identifies the separate in_dir
            return x.replace(f"{in_dir}", f"{in_dir.split('/')[-2]}/")

    for in_dir in splitted_dirs:
        if not in_dir.endswith("/"):
            in_dir = f"{in_dir}/"

        dataset = utils.create_dataset(
            f"{in_dir}/**", process_fn=process_fn, recursive=recursive
        )
        dfs.append(
            utils.create_dataframe(dataset, columns=[tag_column_name, "filepath"])
        )

    df = pd.concat(dfs).reset_index(drop=True)
    return df
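
# A minimal sketch of how `_prepare_df_from_dirs` is meant to be called; the
# helper, the directory names and the "tag" column name below are hypothetical,
# assuming an on-disk layout of <in_dir>/<tag>/<image file>.
def _demo_prepare_df_from_dirs():
    # single input dir: the "/data/train/" prefix is stripped from every filepath
    df_single = _prepare_df_from_dirs("/data/train", tag_column_name="tag")

    # several input dirs (comma-separated): each filepath keeps the last
    # component of its in_dir ("fold_a/", "fold_b/") so rows stay traceable
    # to their source directory
    df_multi = _prepare_df_from_dirs(
        "/data/fold_a,/data/fold_b", tag_column_name="tag"
    )

    # both DataFrames have the columns [tag_column_name, "filepath"]
    return df_single, df_multi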
def get_cat_dogs_dataset(
    dirs: str = "/app/data/data_cat_dogs/*",
    extension: str = "*.jpg",
    test_size: float = 0.2,
    random_state: int = 42,
    tag_file_path: tp.Optional[str] = None,
) -> tp.Tuple[tp.Dict[str, tp.Any], tp.Dict[str, tp.Any], int]:
    dataset = utils.create_dataset(dirs=dirs, extension=extension)
    df = utils.create_dataframe(dataset, columns=["class", "filepath"])

    tag_to_label = utils.get_dataset_labeling(df, "class")
    if tag_file_path is not None:
        with open(tag_file_path, "w") as file:
            json.dump(tag_to_label, file)

    df_with_labels = utils.map_dataframe(
        df,
        tag_column="class",
        class_column="label",
        tag2class=tag_to_label,
        verbose=False,
    )

    train_data, valid_data = utils.split_dataframe_train_test(
        df_with_labels, test_size=test_size, random_state=random_state
    )
    return (
        train_data.to_dict("records"),
        valid_data.to_dict("records"),
        len(tag_to_label),
    )
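
# A minimal usage sketch, assuming catalyst is installed and the images really
# live under the default "/app/data/data_cat_dogs/*" path used above; the
# "tag_to_label.json" filename is just an example.
if __name__ == "__main__":
    train_data, valid_data, num_classes = get_cat_dogs_dataset(
        tag_file_path="./tag_to_label.json"
    )
    # each record is a plain dict with "class", "filepath" and "label" keys
    print(f"train: {len(train_data)} valid: {len(valid_data)} classes: {num_classes}")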