import os
import shutil

import numpy as np
import pandas as pd

import hub

# `get_dataset_store` and `get_children` are helpers defined elsewhere in
# this test suite.


def assert_conversion(tag,
                      num_samples=None,
                      num_classes=None,
                      image_shape=None,
                      max_image_shape=None):
    """
    tries to create a dataset for the kaggle_tag & then convert it into hub format.
    """

    dataset_store = get_dataset_store(tag)
    hub_dir = dataset_store / "hub"

    # delete hub dataset so conversion test can be done
    if hub_dir.is_dir():
        print("hub_dir was found (%s), deleting..." % hub_dir)
        shutil.rmtree(hub_dir)

    try:
        ds = hub.Dataset.from_path(str(dataset_store))
    except Exception as e:
        assert False, "conversion to hub format failed: %s" % e

    print("dataset obj:", ds)
    assert ds is not None

    assert hub_dir.is_dir(), hub_dir

    # validate num samples
    if num_samples is not None:
        assert num_samples == ds.shape[0]

    # validate num classes
    if num_classes is not None:
        actual_num_classes = len(np.unique(ds["label"].compute()))
        assert num_classes == actual_num_classes

    # validate image shape (this is for when all images are the same shape)
    actual_image_shape = ds["image"].shape
    if image_shape is not None:
        expected_image_shape = np.array((num_samples, *image_shape))
        assert np.array_equal(expected_image_shape, actual_image_shape)

    # validate image max shape (this is for when not all images are the same shape)
    if max_image_shape is not None:
        expected_max_image_shape = np.array((*max_image_shape, ))
        actual_max_image_shape = np.max(actual_image_shape, axis=0)
        assert np.array_equal(expected_max_image_shape, actual_max_image_shape)
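

# Usage sketch (hypothetical tag and expected values, for illustration only).
# A dataset whose images all share one shape pins `image_shape`; a dataset
# whose images vary in size pins `max_image_shape` instead:
#
#   assert_conversion("user/some-image-dataset",  # hypothetical Kaggle tag
#                     num_samples=1000,
#                     num_classes=10,
#                     max_image_shape=(512, 512, 3))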


def assert_csv_conversion(tag):
    """
    Tries to convert a directory of CSV files for the given tag into hub
    format, then validates the result against a pandas DataFrame built
    from the same files.
    """
    dataset_store = get_dataset_store(tag)
    hub_dir = dataset_store / "hub"

    # delete hub dataset so conversion test can be done
    if hub_dir.is_dir():
        print("hub_dir was found (%s), deleting..." % hub_dir)
        shutil.rmtree(hub_dir)

    # build a reference DataFrame from the raw CSV files, tagging each row
    # with the name of the file it came from
    df = pd.DataFrame()
    files = get_children(dataset_store)
    for i in files:
        df_csv = pd.read_csv(i)
        df_csv["Filename"] = os.path.basename(i)
        df = pd.concat([df, df_csv])

    try:
        ds = hub.Dataset.from_path(str(dataset_store))
    except Exception as e:
        assert False, "conversion to hub format failed: %s" % e

    print("dataset obj:", ds)
    assert ds is not None

    assert hub_dir.is_dir(), hub_dir

    # the converted dataset should have one sample per row of the reference
    # DataFrame (df = pandas DataFrame, ds = Dataset obtained from hub.auto)
    assert ds.shape == (df.shape[0],)

    # Checking if the column names are the same (hub schema keys carry a
    # leading "/", which is stripped before comparing)
    keys_csv_parser = [i[1:] for i in ds.keys]
    keys_df = list(df.columns)
    assert keys_csv_parser == keys_df

    # Checking if all elements are parsed correctly: object (string) columns
    # must be read back sample by sample, numeric columns in a single call
    for i in keys_df:
        if df[i].dtype == np.dtype("O"):
            column = [ds[i, j].compute() for j in range(df.shape[0])]
        else:
            column = ds[i].compute()
        assert list(column) == list(df[i])

    # Checking if the datatypes of the columns match: the parser stores
    # object (string) columns as text, which hub backs with an int64 dtype;
    # every other column keeps the DataFrame's dtype
    for i in keys_csv_parser:
        if df[i].dtype == np.dtype("O"):
            assert ds[i].dtype == np.dtype("int64")
        else:
            assert ds[i].dtype == df[i].dtype

    # Checking if all the filenames are parsed correctly: collect each
    # filename in order of first appearance and compare against the
    # DataFrame's unique order
    list_names = []
    for i in range(len(ds)):
        name = ds["Filename", i].compute()
        if name not in list_names:
            list_names.append(name)
    assert list(df["Filename"].unique()) == list_names