def assert_conversion(tag, num_samples=None, num_classes=None, image_shape=None, max_image_shape=None):
    """Create a dataset for the kaggle tag and convert it into hub format.

    Args:
        tag: kaggle dataset tag, passed to ``get_dataset_store``.
        num_samples: if given, assert the converted dataset has exactly this
            many samples.
        num_classes: if given, assert the ``label`` column contains exactly
            this many unique values.
        image_shape: if given, assert every image has this exact shape
            (use when all images share one shape).
        max_image_shape: if given, assert the per-axis maximum over all image
            shapes equals this (use when images differ in shape).
    """
    dataset_store = get_dataset_store(tag)
    hub_dir = dataset_store / "hub"

    # delete any previous hub dataset so the conversion is exercised fresh
    if hub_dir.is_dir():
        print("hub_dir was found (%s), deleting..." % hub_dir)
        shutil.rmtree(hub_dir)

    try:
        ds = hub.Dataset.from_path(str(dataset_store))
    except Exception as e:
        # fail with the underlying error attached; a bare `assert False`
        # would hide the actual cause of the conversion failure
        assert False, "hub.Dataset.from_path failed: %r" % (e,)

    print("dataset obj:", ds)
    assert ds is not None
    assert hub_dir.is_dir(), hub_dir

    # validate num samples
    if num_samples is not None:
        assert num_samples == ds.shape[0]

    # validate num classes
    if num_classes is not None:
        actual_num_classes = len(np.unique(ds["label"].compute()))
        assert num_classes == actual_num_classes

    actual_image_shape = ds["image"].shape

    # validate image shape (this is for when all images are the same shape)
    if image_shape is not None:
        expected_image_shape = np.array((num_samples, *image_shape))
        assert np.array_equal(expected_image_shape, actual_image_shape)

    # validate image max shape (this is for when not all images are the same shape)
    if max_image_shape is not None:
        expected_max_image_shape = np.array(max_image_shape)
        actual_max_image_shape = np.max(actual_image_shape, axis=0)
        assert np.array_equal(expected_max_image_shape, actual_max_image_shape)
def assert_conversion(tag):
    """Convert the CSV files in the dataset store for ``tag`` to hub format
    and validate the result against a pandas reading of the same CSVs.

    NOTE(review): this redefines ``assert_conversion`` with a narrower
    signature — looks like two test modules were merged; confirm which
    definition callers expect.

    Args:
        tag: kaggle dataset tag, passed to ``get_dataset_store``.
    """
    import pandas as pd

    dataset_store = get_dataset_store(tag)
    hub_dir = dataset_store / "hub"

    # delete any previous hub dataset so the conversion is exercised fresh
    if hub_dir.is_dir():
        print("hub_dir was found (%s), deleting..." % hub_dir)
        shutil.rmtree(hub_dir)

    # build the reference dataframe: every CSV concatenated, each row tagged
    # with the basename of the file it came from
    frames = []
    for path in get_children(dataset_store):
        frame = pd.read_csv(path)
        frame["Filename"] = os.path.basename(path)
        frames.append(frame)
    # concatenate once at the end instead of inside the loop; repeated
    # pd.concat re-copies the accumulated frame every iteration (quadratic)
    df = pd.concat(frames) if frames else pd.DataFrame()

    try:
        ds = hub.Dataset.from_path(str(dataset_store))
    except Exception as e:
        # fail with the underlying error attached; a bare `assert False`
        # would hide the actual cause of the conversion failure
        assert False, "hub.Dataset.from_path failed: %r" % (e,)

    print("dataset obj:", ds)
    assert ds is not None
    assert hub_dir.is_dir(), hub_dir

    # df = Pandas dataframe, ds = Dataset obtained from hub.auto
    if df is not None:
        assert ds.shape == (df.shape[0],)

        # Checking if the column names are the same (hub keys carry a
        # leading separator character that is stripped here)
        keys_csv_parser = [key[1:] for key in ds.keys]
        keys_df = list(df.columns)
        assert keys_csv_parser == keys_df

        # Checking if all elements are parsed correctly
        for name in keys_df:
            if df[name].dtype == np.dtype("O"):
                # object columns are fetched row by row
                column = [ds[name, row].compute() for row in range(df.shape[0])]
            else:
                column = ds[name].compute()
            assert list(column) == list(df[name])

        # Checking if the datatypes of the columns match; object (string)
        # columns are expected to come back as int64 — presumably
        # label-encoded by the parser (TODO confirm against hub.auto)
        for name in keys_csv_parser:
            if df[name].dtype == np.dtype("O"):
                assert ds[name].dtype == np.dtype("int64")
            else:
                assert ds[name].dtype == df[name].dtype

        # Checking if all the filenames are parsed correctly, preserving
        # first-seen order; compute each value once instead of twice per row
        list_names = []
        for row in range(len(ds)):
            value = ds["Filename", row].compute()
            if value not in list_names:
                list_names.append(value)
        assert list(df["Filename"].unique()) == list_names