def get_cat_dogs_dataset(
    dirs: str = "/app/data/data_cat_dogs/*",
    extension: str = "*.jpg",
    test_size: float = 0.2,
    random_state: int = 42,
    tag_file_path: tp.Optional[str] = None,
) -> tp.Tuple[
    tp.List[tp.Dict[str, tp.Any]],
    tp.List[tp.Dict[str, tp.Any]],
    int,
]:
    """Build labelled train/validation records for the cats-vs-dogs data.

    The files are collected with ``utils.create_dataset``, turned into a
    dataframe with ``class`` and ``filepath`` columns, given a ``label``
    column derived from ``class``, and then split into two parts.

    Args:
        dirs: forwarded to ``utils.create_dataset`` as ``dirs``
            (presumably a glob pattern matching one directory per class).
        extension: forwarded to ``utils.create_dataset`` as ``extension``
            (presumably a glob pattern for the files inside each directory).
        test_size: forwarded to ``utils.split_dataframe_train_test``.
        random_state: forwarded to ``utils.split_dataframe_train_test``.
        tag_file_path: if not ``None``, the tag-to-label mapping is written
            to this path as JSON.

    Returns:
        A 3-tuple ``(train_records, valid_records, num_classes)``. The first
        two are the results of ``to_dict("records")`` on each split (a list
        with one dict per row, assuming the splits are pandas DataFrames);
        the third is the number of entries in the tag-to-label mapping.
    """
    dataset = utils.create_dataset(dirs=dirs, extension=extension)
    df = utils.create_dataframe(dataset, columns=["class", "filepath"])

    tag_to_label = utils.get_dataset_labeling(df, "class")
    if tag_file_path is not None:
        # Explicit encoding keeps the JSON file independent of the locale.
        with open(tag_file_path, "w", encoding="utf-8") as file:
            json.dump(tag_to_label, file)

    df_with_labels = utils.map_dataframe(
        df,
        tag_column="class",
        class_column="label",
        tag2class=tag_to_label,
        verbose=False,
    )

    train_data, valid_data = utils.split_dataframe_train_test(
        df_with_labels,
        test_size=test_size,
        random_state=random_state,
    )

    return (
        train_data.to_dict("records"),
        valid_data.to_dict("records"),
        len(tag_to_label),
    )
def prepare_splits(args):
    """Split a labelled dataframe and write the parts as CSV files.

    Reads the tag-to-class mapping from ``args.labeling`` and the dataframe
    from the CSV at ``args.df``, adds a ``label`` column derived from the
    ``class`` column, splits the result, and writes ``train.csv`` and
    ``valid.csv`` (without the index) into ``args.out_path``.

    Args:
        args: object with the attributes ``labeling`` (path given to
            ``safitty.load``), ``df`` (path given to ``pd.read_csv``),
            ``seed`` (passed as ``random_state``), ``test`` (passed as
            ``test_size``) and ``out_path`` (output directory).
    """
    tag2class = dict(safitty.load(args.labeling))
    df_with_labels = map_dataframe(
        pd.read_csv(args.df),
        tag_column="class",
        class_column="label",
        tag2class=tag2class,
        verbose=False,
    )

    train_data, val_data = train_test_split(
        df_with_labels,
        random_state=args.seed,
        test_size=args.test,
    )

    # Create the output directory on demand so the writes below do not fail
    # on a fresh path. An empty path means the current directory, which
    # os.makedirs would reject, so it is skipped.
    if args.out_path:
        os.makedirs(args.out_path, exist_ok=True)

    train_data.to_csv(os.path.join(args.out_path, 'train.csv'), index=False)
    val_data.to_csv(os.path.join(args.out_path, 'valid.csv'), index=False)