Beispiel #1
0
def get_cat_dogs_dataset(
    dirs: str = "/app/data/data_cat_dogs/*",
    extension: str = "*.jpg",
    test_size: float = 0.2,
    random_state: int = 42,
    tag_file_path: tp.Optional[str] = None,
) -> tp.Tuple[tp.Dict[str, tp.Any], tp.Dict[str, tp.Any], int]:
    """Build labeled train/validation records for the cats-vs-dogs data.

    The dataset is collected with ``utils.create_dataset``, turned into a
    dataframe with ``"class"`` and ``"filepath"`` columns, given a ``"label"``
    column derived from the ``"class"`` tags, and finally split in two.

    Args:
        dirs: Forwarded to ``utils.create_dataset`` as ``dirs``
            (presumably a glob over per-class directories -- confirm
            against ``utils``).
        extension: Forwarded to ``utils.create_dataset`` as ``extension``.
        test_size: Forwarded to ``utils.split_dataframe_train_test``.
        random_state: Forwarded to ``utils.split_dataframe_train_test``.
        tag_file_path: When not ``None``, the tag-to-label mapping is
            written to this path as JSON.

    Returns:
        A 3-tuple of the train split and the validation split, each
        converted with ``.to_dict("records")``, followed by the number of
        entries in the tag-to-label mapping.
    """
    frame = utils.create_dataframe(
        utils.create_dataset(dirs=dirs, extension=extension),
        columns=["class", "filepath"],
    )
    labeling = utils.get_dataset_labeling(frame, "class")

    # Persist the mapping only when the caller asked for it.
    if tag_file_path is not None:
        with open(tag_file_path, "w") as tag_file:
            json.dump(labeling, tag_file)

    labeled_frame = utils.map_dataframe(
        frame,
        tag_column="class",
        class_column="label",
        tag2class=labeling,
        verbose=False,
    )

    train_part, valid_part = utils.split_dataframe_train_test(
        labeled_frame,
        test_size=test_size,
        random_state=random_state,
    )

    train_records = train_part.to_dict("records")
    valid_records = valid_part.to_dict("records")
    return train_records, valid_records, len(labeling)
Beispiel #2
0
def main(args, _=None):
    """Build a tag-to-label mapping for a dataset and write it as JSON.

    The input dataframe is read from ``args.in_csv`` when that is set,
    otherwise it is built from ``args.in_dir`` via ``_prepare_df_from_dirs``.
    If ``args.tag_delim`` is set, the dataframe is first passed through
    ``separate_tags``. The labeling produced by ``get_dataset_labeling`` is
    written to ``args.out_labeling``; the dataframe itself is written as CSV
    to ``args.out_dataset`` when that is set.

    Args:
        args: Object exposing the attributes ``in_csv``, ``in_dir``,
            ``tag_column``, ``recursive``, ``tag_delim``, ``out_labeling``
            and ``out_dataset`` (presumably an ``argparse.Namespace`` --
            confirm against the caller).
        _: Unused; kept so the signature stays compatible with callers
            that pass a second positional argument.

    Raises:
        ValueError: If both ``args.in_csv`` and ``args.in_dir`` are ``None``.
    """
    if args.in_csv is not None:
        df = pd.read_csv(args.in_csv)
    elif args.in_dir is not None:
        df = _prepare_df_from_dirs(
            args.in_dir,
            args.tag_column,
            recursive=args.recursive,
        )
    else:
        # ValueError is still an Exception subclass, so existing broad
        # handlers keep working, but the user now learns what went wrong.
        raise ValueError(
            "No input specified: either `in_csv` or `in_dir` must be provided."
        )

    if args.tag_delim is not None:
        df = separate_tags(
            df,
            tag_column=args.tag_column,
            tag_delim=args.tag_delim,
        )

    tag2lbl = get_dataset_labeling(df, args.tag_column)
    print("Num classes: ", len(tag2lbl))

    with open(args.out_labeling, "w") as fout:
        json.dump(tag2lbl, fout, indent=4)

    if args.out_dataset is not None:
        df.to_csv(args.out_dataset, index=False)