def test_schema_exception():
    logger.info("test_schema_exception")

    with pytest.raises(TypeError) as info:
        ds.Schema(1)
    assert "Argument schema_file with value 1 is not of type (<class 'str'>,)" in str(info.value)

    with pytest.raises(RuntimeError) as info:
        schema = ds.Schema(SCHEMA_FILE)
        columns = [{'type': 'int8', 'shape': [3, 3]}]
        schema.parse_columns(columns)
    assert "Column's name is missing" in str(info.value)
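# A minimal sketch of the module-level preamble these snippets assume. The import
# paths are the standard MindSpore ones; the SCHEMA_FILE / FILES values and the
# dataset_equal import are illustrative assumptions, not taken from this file.
import json

import numpy as np
import pytest

import mindspore.dataset as ds
import mindspore.common.dtype as mstype
from mindspore import log as logger

from util import dataset_equal  # row-by-row comparison helper (assumed import path)

SCHEMA_FILE = "../data/dataset/testTFTestAllTypes/datasetSchema.json"  # assumed fixture path
FILES = ["../data/dataset/testTFTestAllTypes/test.data"]  # assumed fixture path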
def test_schema_file_vs_string():
    logger.info("test_schema_file_vs_string")

    schema1 = ds.Schema(SCHEMA_FILE)
    with open(SCHEMA_FILE) as file:
        json_obj = json.load(file)
        schema2 = ds.Schema()
        schema2.from_json(json_obj)

    ds1 = ds.TFRecordDataset(FILES, schema1)
    ds2 = ds.TFRecordDataset(FILES, schema2)

    dataset_equal(ds1, ds2, 0)
def test_cache_nomap_basic1():
    """
    A random dataset (a non mappable dataset) with a cache over it just after the leaf
    """
    logger.info("Test cache nomap basic 1")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8,
                      shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # create a cache. arbitrary session_id for now
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # User-created sampler here
    ds1 = ds.RandomDataset(schema=schema, total_rows=10, num_parallel_workers=4, cache=some_cache)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():
        logger.info("printing the label: {}".format(data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: {}".format(num_iter))
    assert num_iter == 40
    logger.info("test_cache_nomap_basic1 Ended.\n")
def test_cache_nomap_basic2():
    """
    A random dataset (a non mappable dataset) with a cache over it just after the leaf
    """
    logger.info("Test cache nomap basic 2")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8,
                      shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # create a cache. arbitrary session_id for now
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # sampler arg not given directly, however any of these args will auto-generate an appropriate sampler:
    # num_samples, shuffle, num_shards, shard_id
    # In this case, the presence of num_samples chooses a sampler.
    ds1 = ds.RandomDataset(schema=schema, total_rows=20, num_samples=20,
                           num_parallel_workers=4, cache=some_cache)
    ds1 = ds1.repeat(2)

    num_iter = 0
    for data in ds1.create_dict_iterator():
        logger.info("printing the label: {}".format(data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: {}".format(num_iter))
    assert num_iter == 40
    logger.info("test_cache_nomap_basic2 Ended.\n")
def test_get_column_name_tfrecord():
    data = ds.TFRecordDataset(TFRECORD_DIR, TFRECORD_SCHEMA)
    assert data.get_col_names() == ["col_1d", "col_2d", "col_3d", "col_binary", "col_float",
                                    "col_sint16", "col_sint32", "col_sint64"]

    data = ds.TFRecordDataset(TFRECORD_DIR, TFRECORD_SCHEMA,
                              columns_list=["col_sint16", "col_sint64", "col_2d", "col_binary"])
    assert data.get_col_names() == ["col_sint16", "col_sint64", "col_2d", "col_binary"]

    data = ds.TFRecordDataset(TFRECORD_DIR)
    assert data.get_col_names() == ["col_1d", "col_2d", "col_3d", "col_binary", "col_float",
                                    "col_sint16", "col_sint32", "col_sint64", "col_sint8"]

    s = ds.Schema()
    s.add_column("line", "string", [])
    s.add_column("words", "string", [-1])
    s.add_column("chinese", "string", [])
    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)
    assert data.get_col_names() == ["line", "words", "chinese"]
def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000,
                    line_per_sample=1000, rank_size=None, rank_id=None,
                    manual_shape=None, target_column=40):
    """ get_tf_dataset """
    dataset_files = []
    file_prefix_name = 'train' if train_mode else 'test'
    shuffle = train_mode
    for (dirpath, _, filenames) in os.walk(data_dir):
        for filename in filenames:
            if file_prefix_name in filename and "tfrecord" in filename:
                dataset_files.append(os.path.join(dirpath, filename))

    schema = ds.Schema()
    schema.add_column('feat_ids', de_type=mstype.int32)
    schema.add_column('feat_vals', de_type=mstype.float32)
    schema.add_column('label', de_type=mstype.float32)

    if rank_size is not None and rank_id is not None:
        data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
                                      num_parallel_workers=8, num_shards=rank_size, shard_id=rank_id,
                                      shard_equal_rows=True)
    else:
        data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
                                      num_parallel_workers=8)

    data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
    data_set = data_set.map(operations=_padding_func(batch_size, manual_shape, target_column),
                            input_columns=['feat_ids', 'feat_vals', 'label'],
                            column_order=['feat_ids', 'feat_vals', 'label'],
                            num_parallel_workers=8)
    data_set = data_set.repeat(epochs)
    return data_set
def test_randomdataset_basic2():
    logger.info("Test randomdataset basic 2")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8,
                      shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # Make up about 10 samples
    ds1 = ds.RandomDataset(schema=schema, num_samples=10, num_parallel_workers=1)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        # logger.info(data["image"])
        logger.info("printing the label: {}".format(data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: {}".format(num_iter))
    assert num_iter == 40
def test_randomdataset_basic2():
    logger.info("Test randomdataset basic 2")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8,
                      shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # Make up 10 rows
    ds1 = ds.RandomDataset(schema=schema, total_rows=10, num_parallel_workers=1)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator(num_epochs=1):
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        # logger.info(data["image"])
        logger.info("printing the label: {}".format(data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: {}".format(num_iter))
    assert num_iter == 40
    logger.info("Test randomdataset basic 2 complete")
def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
                    line_per_sample=1000, rank_size=None, rank_id=None):
    """
    Get dataset with tfrecord format.

    Args:
        directory (str): Dataset directory.
        train_mode (bool): Whether dataset is used for train or eval (default=True).
        epochs (int): Dataset epoch size (default=1).
        batch_size (int): Dataset batch size (default=1000).
        line_per_sample (int): The number of samples per line (default=1000).
        rank_size (int): The number of devices, not necessary for single device (default=None).
        rank_id (int): Id of device, not necessary for single device (default=None).

    Returns:
        Dataset.
    """
    dataset_files = []
    file_prefix_name = 'train' if train_mode else 'test'
    shuffle = train_mode
    for (dir_path, _, filenames) in os.walk(directory):
        for filename in filenames:
            if file_prefix_name in filename and 'tfrecord' in filename:
                dataset_files.append(os.path.join(dir_path, filename))

    schema = ds.Schema()
    schema.add_column('feat_ids', de_type=mstype.int32)
    schema.add_column('feat_vals', de_type=mstype.float32)
    schema.add_column('label', de_type=mstype.float32)

    if rank_size is not None and rank_id is not None:
        data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
                                      num_parallel_workers=8, num_shards=rank_size, shard_id=rank_id,
                                      shard_equal_rows=True)
    else:
        data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
                                      num_parallel_workers=8)

    data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
    # Flatten each batched column and reshape: 39 features per sample, 1 label per sample.
    data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
                                                         np.array(y).flatten().reshape(batch_size, 39),
                                                         np.array(z).flatten().reshape(batch_size, 1))),
                            input_columns=['feat_ids', 'feat_vals', 'label'],
                            column_order=['feat_ids', 'feat_vals', 'label'],
                            num_parallel_workers=8)
    data_set = data_set.repeat(epochs)
    return data_set
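def _demo_get_tf_dataset_usage():
    # Hypothetical call site for the loader above (directory and sizes are made up
    # for illustration). Each TFRecord line packs line_per_sample samples, so the
    # pipeline batches batch_size / line_per_sample lines per step and the map
    # reshapes them to (batch_size, 39) features and (batch_size, 1) labels.
    demo_set = _get_tf_dataset("./criteo_tfrecord/", train_mode=True, epochs=1,
                               batch_size=16000, line_per_sample=1000)
    for batch in demo_set.create_dict_iterator(num_epochs=1, output_numpy=True):
        print(batch["feat_ids"].shape)  # expected: (16000, 39)
        break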
def test_tfrecord3():
    s = ds.Schema()
    s.add_column("line", mstype.string, [])
    s.add_column("words", mstype.string, [-1, 2])
    s.add_column("chinese", mstype.string, [])

    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)

    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].reshape([2, 2]).shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], to_str(d["line"]))
        np.testing.assert_array_equal(words[i].reshape([2, 2]), to_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))
def test_tfrecord1():
    s = ds.Schema()
    s.add_column("line", "string", [])
    s.add_column("words", "string", [-1])
    s.add_column("chinese", "string", [])

    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)

    for i, d in enumerate(data.create_dict_iterator()):
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], to_str(d["line"]))
        np.testing.assert_array_equal(words[i], to_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))
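# The text-TFRecord tests above compare decoded strings against module-level golden
# arrays. A sketch of the assumed fixtures follows; the real golden values live next
# to text.tfrecord in the test data, so the entries below are placeholders only, and
# the to_str import path is an assumption.
from mindspore.dataset.text import to_str  # decodes numpy byte arrays back to str

line = np.array(["line 0", "line 1", "line 2"])  # one scalar string per record (placeholder)
words = np.array([["w00", "w01", "w02", "w03"],
                  ["w10", "w11", "w12", "w13"],
                  ["w20", "w21", "w22", "w23"]])  # four words per record (placeholder)
chinese = np.array(["句子零", "句子一", "句子二"])  # scalar Chinese strings (placeholder)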
def test_tf_wrong_schema():
    logger.info("test_tf_wrong_schema")

    files = ["../data/dataset/test_tf_file_3_images2/train-0000-of-0001.data"]
    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8, shape=[1])
    schema.add_column('label', de_type=mstype.int64, shape=[1])
    data1 = ds.TFRecordDataset(files, schema, shuffle=False)

    exception_occurred = False
    try:
        for _ in data1:
            pass
    except RuntimeError as e:
        exception_occurred = True
        assert "Shape in schema's column 'image' is incorrect" in str(e)

    assert exception_occurred, "test_tf_wrong_schema failed."
def type_tester_with_type_check_2c_schema(t, c):
    logger.info("Test with Type {}".format(t.__name__))

    schema = ds.Schema()
    schema.add_column("data0", c[0])
    schema.add_column("data1", c[1])

    # apply dataset operations
    data1 = ds.GeneratorDataset((lambda: generator_with_type_2c(t)), schema=schema)
    data1 = data1.batch(4)

    i = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
        assert np.array_equal(item["data0"], golden)
        i = i + 4
def test_tf_record_schema_columns_list():
    schema = ds.Schema()
    schema.add_column('col_1d', de_type=mstype.int64, shape=[2])
    schema.add_column('col_2d', de_type=mstype.int64, shape=[2, 2])
    schema.add_column('col_3d', de_type=mstype.int64, shape=[2, 2, 2])
    schema.add_column('col_binary', de_type=mstype.uint8, shape=[1])
    schema.add_column('col_float', de_type=mstype.float32, shape=[1])
    schema.add_column('col_sint16', de_type=mstype.int64, shape=[1])
    schema.add_column('col_sint32', de_type=mstype.int64, shape=[1])
    schema.add_column('col_sint64', de_type=mstype.int64, shape=[1])

    data = ds.TFRecordDataset(FILES, schema=schema, shuffle=False, columns_list=["col_sint16"])
    row = data.create_dict_iterator().get_next()
    assert row["col_sint16"] == [-32768]

    with pytest.raises(KeyError) as info:
        a = row["col_sint32"]
    assert "col_sint32" in str(info.value)
def test_tf_record_schema():
    schema = ds.Schema()
    schema.add_column('col_1d', de_type=mstype.int64, shape=[2])
    schema.add_column('col_2d', de_type=mstype.int64, shape=[2, 2])
    schema.add_column('col_3d', de_type=mstype.int64, shape=[2, 2, 2])
    schema.add_column('col_binary', de_type=mstype.uint8, shape=[1])
    schema.add_column('col_float', de_type=mstype.float32, shape=[1])
    schema.add_column('col_sint16', de_type=mstype.int64, shape=[1])
    schema.add_column('col_sint32', de_type=mstype.int64, shape=[1])
    schema.add_column('col_sint64', de_type=mstype.int64, shape=[1])
    data1 = ds.TFRecordDataset(FILES, schema=schema, shuffle=ds.Shuffle.FILES)

    data2 = ds.TFRecordDataset(FILES, schema=SCHEMA_FILE, shuffle=ds.Shuffle.FILES)

    for d1, d2 in zip(data1, data2):
        for t1, t2 in zip(d1, d2):
            assert np.array_equal(t1, t2)
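def build_schema_from_json_example():
    # For reference, the file-based schema that test_tf_record_schema compares
    # against expresses the same columns as JSON. This is a rough, hand-written
    # equivalent using the per-column {"type", "shape"} layout accepted by
    # Schema.from_json; the exact contents of SCHEMA_FILE may differ.
    json_obj = {
        "columns": {
            "col_1d": {"type": "int64", "shape": [2]},
            "col_2d": {"type": "int64", "shape": [2, 2]},
            "col_3d": {"type": "int64", "shape": [2, 2, 2]},
            "col_binary": {"type": "uint8", "shape": [1]},
            "col_float": {"type": "float32", "shape": [1]},
            "col_sint16": {"type": "int64", "shape": [1]},
            "col_sint32": {"type": "int64", "shape": [1]},
            "col_sint64": {"type": "int64", "shape": [1]},
        }
    }
    schema = ds.Schema()
    schema.from_json(json_obj)
    return schema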
def test_randomdataset_basic1():
    logger.info("Test randomdataset basic 1")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8, shape=[2])
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # apply dataset operations
    ds1 = ds.RandomDataset(schema=schema, total_rows=50, num_parallel_workers=4)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("{} image: {}".format(num_iter, data["image"]))
        logger.info("{} label: {}".format(num_iter, data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: {}".format(num_iter))
    assert num_iter == 200
    logger.info("Test randomdataset basic 1 complete")
def _get_tf_dataset(data_dir, schema_dict, input_shape_dict, train_mode=True, epochs=1,
                    batch_size=4096, line_per_sample=4096, rank_size=None, rank_id=None):
    """ _get_tf_dataset """
    dataset_files = []
    file_prefix_name = 'train' if train_mode else 'eval'
    shuffle = bool(train_mode)
    for (dirpath, _, filenames) in os.walk(data_dir):
        for filename in filenames:
            if file_prefix_name in filename and "tfrecord" in filename:
                dataset_files.append(os.path.join(dirpath, filename))

    schema = ds.Schema()
    float_key_list = ["label", "continue_val"]

    columns_list = []
    for key, attr_dict in schema_dict.items():
        print("key: {}; shape: {}".format(key, attr_dict["tf_shape"]))
        columns_list.append(key)
        if key in set(float_key_list):
            ms_dtype = mstype.float32
        else:
            ms_dtype = mstype.int32
        schema.add_column(key, de_type=ms_dtype)

    if rank_size is not None and rank_id is not None:
        data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
                                      num_parallel_workers=8, num_shards=rank_size, shard_id=rank_id,
                                      shard_equal_rows=True)
    else:
        data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
                                      num_parallel_workers=8)

    if batch_size <= 0:
        raise ValueError("Batch size should be a positive int value, but found {}".format(str(batch_size)))
    if batch_size % line_per_sample != 0:
        raise ValueError("Batch size should be a multiple of {}, but found {}".format(
            str(line_per_sample), str(batch_size)))

    data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)

    operations_list = []
    for key in columns_list:
        operations_list.append(lambda x: np.array(x).flatten().reshape(input_shape_dict[key]))
    print("input_shape_dict start logging")
    print(input_shape_dict)
    print("input_shape_dict end logging")
    print(schema_dict)

    def mixup(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u):
        a = np.asarray(a.reshape(batch_size,))
        b = np.array(b).flatten().reshape(batch_size, -1)
        c = np.array(c).flatten().reshape(batch_size, -1)
        d = np.array(d).flatten().reshape(batch_size, -1)
        e = np.array(e).flatten().reshape(batch_size, -1)
        f = np.array(f).flatten().reshape(batch_size, -1)
        g = np.array(g).flatten().reshape(batch_size, -1)
        h = np.array(h).flatten().reshape(batch_size, -1)
        i = np.array(i).flatten().reshape(batch_size, -1)
        j = np.array(j).flatten().reshape(batch_size, -1)
        k = np.array(k).flatten().reshape(batch_size, -1)
        l = np.array(l).flatten().reshape(batch_size, -1)
        m = np.array(m).flatten().reshape(batch_size, -1)
        n = np.array(n).flatten().reshape(batch_size, -1)
        o = np.array(o).flatten().reshape(batch_size, -1)
        p = np.array(p).flatten().reshape(batch_size, -1)
        q = np.array(q).flatten().reshape(batch_size, -1)
        r = np.array(r).flatten().reshape(batch_size, -1)
        s = np.array(s).flatten().reshape(batch_size, -1)
        t = np.array(t).flatten().reshape(batch_size, -1)
        u = np.array(u).flatten().reshape(batch_size, -1)
        return a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u

    data_set = data_set.map(
        operations=mixup,
        input_columns=[
            'label', 'continue_val', 'indicator_id', 'emb_128_id', 'emb_64_single_id',
            'multi_doc_ad_category_id', 'multi_doc_ad_category_id_mask', 'multi_doc_event_entity_id',
            'multi_doc_event_entity_id_mask', 'multi_doc_ad_entity_id', 'multi_doc_ad_entity_id_mask',
            'multi_doc_event_topic_id', 'multi_doc_event_topic_id_mask', 'multi_doc_event_category_id',
            'multi_doc_event_category_id_mask', 'multi_doc_ad_topic_id', 'multi_doc_ad_topic_id_mask',
            'ad_id', 'display_ad_and_is_leak', 'display_id', 'is_leak'
        ],
        column_order=[
            'label', 'continue_val', 'indicator_id', 'emb_128_id', 'emb_64_single_id',
            'multi_doc_ad_category_id', 'multi_doc_ad_category_id_mask', 'multi_doc_event_entity_id',
            'multi_doc_event_entity_id_mask', 'multi_doc_ad_entity_id', 'multi_doc_ad_entity_id_mask',
            'multi_doc_event_topic_id', 'multi_doc_event_topic_id_mask', 'multi_doc_event_category_id',
            'multi_doc_event_category_id_mask', 'multi_doc_ad_topic_id', 'multi_doc_ad_topic_id_mask',
            'display_id', 'ad_id', 'display_ad_and_is_leak', 'is_leak'
        ],
        num_parallel_workers=8)
    data_set = data_set.repeat(epochs)
    return data_set
def test_schema_simple():
    logger.info("test_schema_simple")
    ds.Schema(SCHEMA_FILE)
def test_simple_schema():
    ds.Schema(SCHEMA_FILE)