Example #1
def test_schema_exception():
    logger.info("test_schema_exception")

    with pytest.raises(TypeError) as info:
        ds.Schema(1)
    assert "Argument schema_file with value 1 is not of type (<class 'str'>,)" in str(
        info.value)

    with pytest.raises(RuntimeError) as info:
        schema = ds.Schema(SCHEMA_FILE)
        columns = [{'type': 'int8', 'shape': [3, 3]}]
        schema.parse_columns(columns)
    assert "Column's name is missing" in str(info.value)
Example #2
def test_schema_file_vs_string():
    logger.info("test_schema_file_vs_string")

    schema1 = ds.Schema(SCHEMA_FILE)
    with open(SCHEMA_FILE) as file:
        json_obj = json.load(file)
        schema2 = ds.Schema()
        schema2.from_json(json_obj)

    ds1 = ds.TFRecordDataset(FILES, schema1)
    ds2 = ds.TFRecordDataset(FILES, schema2)

    dataset_equal(ds1, ds2, 0)
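
# A minimal sketch, not part of the original test: the object passed to from_json
# above is just the parsed JSON, so an equivalent in-memory dict works as well.
# The column names and types below are made up for illustration.
import mindspore.dataset as ds

json_obj = {
    "columns": {
        "col_a": {"type": "int64", "shape": [1]},
        "col_b": {"type": "float32", "shape": [1]},
    }
}
schema3 = ds.Schema()
schema3.from_json(json_obj)  # same call as above, just without reading a file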
Example #3
def test_cache_nomap_basic1():
    """
    A random dataset (a non-mappable dataset) with a cache over it just after the leaf
    """

    logger.info("Test cache nomap basic 1")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8,
                      shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # create a cache.  arbitrary session_id for now
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # No sampler is given; total_rows determines how many rows this leaf generates
    ds1 = ds.RandomDataset(schema=schema, total_rows=10, num_parallel_workers=4, cache=some_cache)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():
        logger.info("printing the label: {}".format(data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 40
    logger.info("test_cache_nomap_basic1 Ended.\n")
Example #4
def test_schema_exception():
    logger.info("test_schema_exception")

    with pytest.raises(TypeError) as info:
        ds.Schema(1)
    assert "Argument schema_file with value 1 is not of type (<class 'str'>,)" in str(
        info.value)
Example #5
def test_cache_nomap_basic2():
    """
    A random dataset (a non-mappable dataset) with a cache over it just after the leaf
    """

    logger.info("Test cache nomap basic 2")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8,
                      shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # create a cache.  arbitrary session_id for now
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # The sampler arg is not given directly; any of these args will auto-generate an
    # appropriate sampler: num_samples, shuffle, num_shards, shard_id.
    # In this case, the presence of num_samples chooses the sampler.
    ds1 = ds.RandomDataset(schema=schema, total_rows=20, num_samples=20, num_parallel_workers=4, cache=some_cache)
    ds1 = ds1.repeat(2)

    num_iter = 0
    for data in ds1.create_dict_iterator():
        logger.info("printing the label: {}".format(data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 40
    logger.info("test_cache_nomap_basic2 Ended.\n")
Example #6
def test_get_column_name_tfrecord():
    data = ds.TFRecordDataset(TFRECORD_DIR, TFRECORD_SCHEMA)
    assert data.get_col_names() == [
        "col_1d", "col_2d", "col_3d", "col_binary", "col_float", "col_sint16",
        "col_sint32", "col_sint64"
    ]
    data = ds.TFRecordDataset(
        TFRECORD_DIR,
        TFRECORD_SCHEMA,
        columns_list=["col_sint16", "col_sint64", "col_2d", "col_binary"])
    assert data.get_col_names() == [
        "col_sint16", "col_sint64", "col_2d", "col_binary"
    ]

    data = ds.TFRecordDataset(TFRECORD_DIR)
    assert data.get_col_names() == [
        "col_1d", "col_2d", "col_3d", "col_binary", "col_float", "col_sint16",
        "col_sint32", "col_sint64", "col_sint8"
    ]
    s = ds.Schema()
    s.add_column("line", "string", [])
    s.add_column("words", "string", [-1])
    s.add_column("chinese", "string", [])

    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord",
                              shuffle=False,
                              schema=s)
    assert data.get_col_names() == ["line", "words", "chinese"]
Example #7
def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000,
                    line_per_sample=1000, rank_size=None, rank_id=None,
                    manual_shape=None, target_column=40):
    """
    Build a TFRecord dataset for train or eval, with optional sharding across devices.
    """
    dataset_files = []
    file_prefix_name = 'train' if train_mode else 'test'
    shuffle = train_mode
    for (dirpath, _, filenames) in os.walk(data_dir):
        for filename in filenames:
            if file_prefix_name in filename and "tfrecord" in filename:
                dataset_files.append(os.path.join(dirpath, filename))
    schema = ds.Schema()
    schema.add_column('feat_ids', de_type=mstype.int32)
    schema.add_column('feat_vals', de_type=mstype.float32)
    schema.add_column('label', de_type=mstype.float32)
    if rank_size is not None and rank_id is not None:
        data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
                                      num_parallel_workers=8,
                                      num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True)
    else:
        data_set = ds.TFRecordDataset(dataset_files=dataset_files,
                                      shuffle=shuffle, schema=schema, num_parallel_workers=8)
    data_set = data_set.batch(int(batch_size / line_per_sample),
                              drop_remainder=True)

    data_set = data_set.map(operations=_padding_func(batch_size, manual_shape, target_column),
                            input_columns=['feat_ids', 'feat_vals', 'label'],
                            column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8)
    data_set = data_set.repeat(epochs)
    return data_set
Example #8
def test_randomdataset_basic2():
    logger.info("Test randomdataset basic 2")

    schema = ds.Schema()
    schema.add_column(
        'image', de_type=mstype.uint8,
        shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # Make up about 10 samples
    ds1 = ds.RandomDataset(schema=schema,
                           num_samples=10,
                           num_parallel_workers=1)

    # Repeat the 10 generated samples 4 times (40 rows in total)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        #logger.info(data["image"])
        logger.info("printing the label: {}".format(data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: ", num_iter)
    assert (num_iter == 40)
Example #9
def test_randomdataset_basic2():
    logger.info("Test randomdataset basic 2")

    schema = ds.Schema()
    schema.add_column(
        'image', de_type=mstype.uint8,
        shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # Make up 10 rows
    ds1 = ds.RandomDataset(schema=schema,
                           total_rows=10,
                           num_parallel_workers=1)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator(
            num_epochs=1):  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        # logger.info(data["image"])
        logger.info("printing the label: {}".format(data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: {}".format(num_iter))
    assert num_iter == 40
    logger.info("Test randomdataset basic 2 complete")
Example #10
def _get_tf_dataset(directory,
                    train_mode=True,
                    epochs=1,
                    batch_size=1000,
                    line_per_sample=1000,
                    rank_size=None,
                    rank_id=None):
    """
    Get a dataset in TFRecord format.

    Args:
        directory (str): Dataset directory.
        train_mode (bool): Whether the dataset is used for training or evaluation (default=True).
        epochs (int): Dataset epoch size (default=1).
        batch_size (int): Dataset batch size (default=1000).
        line_per_sample (int): The number of samples per line (default=1000).
        rank_size (int): The number of devices; not needed for a single device (default=None).
        rank_id (int): Id of the device; not needed for a single device (default=None).

    Returns:
        Dataset.
    """
    dataset_files = []
    file_prefix_name = 'train' if train_mode else 'test'
    shuffle = train_mode
    for (dir_path, _, filenames) in os.walk(directory):
        for filename in filenames:
            if file_prefix_name in filename and 'tfrecord' in filename:
                dataset_files.append(os.path.join(dir_path, filename))
    schema = ds.Schema()
    schema.add_column('feat_ids', de_type=mstype.int32)
    schema.add_column('feat_vals', de_type=mstype.float32)
    schema.add_column('label', de_type=mstype.float32)
    if rank_size is not None and rank_id is not None:
        data_set = ds.TFRecordDataset(dataset_files=dataset_files,
                                      shuffle=shuffle,
                                      schema=schema,
                                      num_parallel_workers=8,
                                      num_shards=rank_size,
                                      shard_id=rank_id,
                                      shard_equal_rows=True)
    else:
        data_set = ds.TFRecordDataset(dataset_files=dataset_files,
                                      shuffle=shuffle,
                                      schema=schema,
                                      num_parallel_workers=8)
    data_set = data_set.batch(int(batch_size / line_per_sample),
                              drop_remainder=True)
    data_set = data_set.map(operations=(lambda x, y, z: (np.array(
        x).flatten().reshape(batch_size, 39), np.array(y).flatten().reshape(
            batch_size, 39), np.array(z).flatten().reshape(batch_size, 1))),
                            input_columns=['feat_ids', 'feat_vals', 'label'],
                            column_order=['feat_ids', 'feat_vals', 'label'],
                            num_parallel_workers=8)
    data_set = data_set.repeat(epochs)
    return data_set
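
# A numpy-only sketch (made-up sizes, not part of the original pipeline) of the
# reshape done in the map above: each TFRecord row packs line_per_sample samples of
# 39 features, so a batch of batch_size / line_per_sample rows flattens back into
# one line per sample.
import numpy as np

batch_size, line_per_sample, num_feats = 2000, 1000, 39
rows_per_batch = batch_size // line_per_sample                    # 2 TFRecord rows per batch
packed = np.zeros((rows_per_batch, line_per_sample * num_feats))  # shape (2, 39000)
feat_ids = np.array(packed).flatten().reshape(batch_size, num_feats)
assert feat_ids.shape == (2000, 39)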
Example #11
def test_tfrecord3():
    s = ds.Schema()
    s.add_column("line", mstype.string, [])
    s.add_column("words", mstype.string, [-1, 2])
    s.add_column("chinese", mstype.string, [])

    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)

    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].reshape([2, 2]).shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], to_str(d["line"]))
        np.testing.assert_array_equal(words[i].reshape([2, 2]), to_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))
Example #12
def test_tfrecord1():
    s = ds.Schema()
    s.add_column("line", "string", [])
    s.add_column("words", "string", [-1])
    s.add_column("chinese", "string", [])

    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)

    for i, d in enumerate(data.create_dict_iterator()):
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], to_str(d["line"]))
        np.testing.assert_array_equal(words[i], to_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))
Example #13
def test_tf_wrong_schema():
    logger.info("test_tf_wrong_schema")
    files = ["../data/dataset/test_tf_file_3_images2/train-0000-of-0001.data"]
    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8, shape=[1])
    schema.add_column('label', de_type=mstype.int64, shape=[1])
    data1 = ds.TFRecordDataset(files, schema, shuffle=False)
    exception_occurred = False
    try:
        for _ in data1:
            pass
    except RuntimeError as e:
        exception_occurred = True
        assert "Shape in schema's column 'image' is incorrect" in str(e)

    assert exception_occurred, "test_tf_wrong_schema failed."
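
# The same wrong-schema check could also be written with pytest.raises, in the style
# of test_schema_exception above; this is only a sketch of a rephrasing, not extra
# coverage. Imports match the ones the original tests rely on.
import pytest
import mindspore.dataset as ds
import mindspore.common.dtype as mstype

def test_tf_wrong_schema_with_pytest_raises():
    files = ["../data/dataset/test_tf_file_3_images2/train-0000-of-0001.data"]
    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8, shape=[1])  # deliberately wrong shape
    schema.add_column('label', de_type=mstype.int64, shape=[1])
    data1 = ds.TFRecordDataset(files, schema, shuffle=False)
    with pytest.raises(RuntimeError) as info:
        for _ in data1:
            pass
    assert "Shape in schema's column 'image' is incorrect" in str(info.value)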
Example #14
def type_tester_with_type_check_2c_schema(t, c):
    logger.info("Test with Type {}".format(t.__name__))

    schema = ds.Schema()
    schema.add_column("data0", c[0])
    schema.add_column("data1", c[1])

    # apply dataset operations
    data1 = ds.GeneratorDataset((lambda: generator_with_type_2c(t)), schema=schema)

    data1 = data1.batch(4)

    i = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
        assert np.array_equal(item["data0"], golden)
        i = i + 4
Example #15
def test_tf_record_schema_columns_list():
    schema = ds.Schema()
    schema.add_column('col_1d', de_type=mstype.int64, shape=[2])
    schema.add_column('col_2d', de_type=mstype.int64, shape=[2, 2])
    schema.add_column('col_3d', de_type=mstype.int64, shape=[2, 2, 2])
    schema.add_column('col_binary', de_type=mstype.uint8, shape=[1])
    schema.add_column('col_float', de_type=mstype.float32, shape=[1])
    schema.add_column('col_sint16', de_type=mstype.int64, shape=[1])
    schema.add_column('col_sint32', de_type=mstype.int64, shape=[1])
    schema.add_column('col_sint64', de_type=mstype.int64, shape=[1])
    data = ds.TFRecordDataset(FILES, schema=schema, shuffle=False, columns_list=["col_sint16"])
    row = data.create_dict_iterator().get_next()
    assert row["col_sint16"] == [-32768]

    with pytest.raises(KeyError) as info:
        a = row["col_sint32"]
    assert "col_sint32" in str(info.value)
Example #16
def test_tf_record_schema():
    schema = ds.Schema()
    schema.add_column('col_1d', de_type=mstype.int64, shape=[2])
    schema.add_column('col_2d', de_type=mstype.int64, shape=[2, 2])
    schema.add_column('col_3d', de_type=mstype.int64, shape=[2, 2, 2])
    schema.add_column('col_binary', de_type=mstype.uint8, shape=[1])
    schema.add_column('col_float', de_type=mstype.float32, shape=[1])
    schema.add_column('col_sint16', de_type=mstype.int64, shape=[1])
    schema.add_column('col_sint32', de_type=mstype.int64, shape=[1])
    schema.add_column('col_sint64', de_type=mstype.int64, shape=[1])
    data1 = ds.TFRecordDataset(FILES, schema=schema, shuffle=ds.Shuffle.FILES)

    data2 = ds.TFRecordDataset(FILES, schema=SCHEMA_FILE, shuffle=ds.Shuffle.FILES)

    for d1, d2 in zip(data1, data2):
        for t1, t2 in zip(d1, d2):
            assert np.array_equal(t1, t2)
Example #17
def test_randomdataset_basic1():
    logger.info("Test randomdataset basic 1")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8, shape=[2])
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # apply dataset operations
    ds1 = ds.RandomDataset(schema=schema, total_rows=50, num_parallel_workers=4)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("{} image: {}".format(num_iter, data["image"]))
        logger.info("{} label: {}".format(num_iter, data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: {}".format(num_iter))
    assert num_iter == 200
    logger.info("Test randomdataset basic 1 complete")
Example #18
def _get_tf_dataset(data_dir,
                    schema_dict,
                    input_shape_dict,
                    train_mode=True,
                    epochs=1,
                    batch_size=4096,
                    line_per_sample=4096,
                    rank_size=None,
                    rank_id=None):
    """
    Build a TFRecord dataset from the columns described by schema_dict, with optional sharding.
    """
    dataset_files = []
    file_prefix_name = 'train' if train_mode else 'eval'
    shuffle = bool(train_mode)
    for (dirpath, _, filenames) in os.walk(data_dir):
        for filename in filenames:
            if file_prefix_name in filename and "tfrecord" in filename:
                dataset_files.append(os.path.join(dirpath, filename))
    schema = ds.Schema()

    float_key_list = ["label", "continue_val"]

    columns_list = []
    for key, attr_dict in schema_dict.items():
        print("key: {}; shape: {}".format(key, attr_dict["tf_shape"]))
        columns_list.append(key)
        if key in set(float_key_list):
            ms_dtype = mstype.float32
        else:
            ms_dtype = mstype.int32
        schema.add_column(key, de_type=ms_dtype)

    if rank_size is not None and rank_id is not None:
        data_set = ds.TFRecordDataset(dataset_files=dataset_files,
                                      shuffle=shuffle,
                                      schema=schema,
                                      num_parallel_workers=8,
                                      num_shards=rank_size,
                                      shard_id=rank_id,
                                      shard_equal_rows=True)
    else:
        data_set = ds.TFRecordDataset(dataset_files=dataset_files,
                                      shuffle=shuffle,
                                      schema=schema,
                                      num_parallel_workers=8)
    if batch_size <= 0:
        raise ValueError(
            "Batch size should be a positive int value, but found {}".format(
                str(batch_size)))
    if batch_size % line_per_sample != 0:
        raise ValueError(
            "Batch size should be a multiple of {}, but found {}".format(
                str(line_per_sample), str(batch_size)))

    data_set = data_set.batch(int(batch_size / line_per_sample),
                              drop_remainder=True)

    operations_list = []
    for key in columns_list:
        # Bind key at definition time; otherwise every lambda would capture the last key.
        operations_list.append(
            lambda x, key=key: np.array(x).flatten().reshape(input_shape_dict[key]))
    print("input_shape_dict start logging")
    print(input_shape_dict)
    print("input_shape_dict end logging")
    print(schema_dict)

    def mixup(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u):
        a = np.asarray(a.reshape(batch_size, ))
        b = np.array(b).flatten().reshape(batch_size, -1)
        c = np.array(c).flatten().reshape(batch_size, -1)
        d = np.array(d).flatten().reshape(batch_size, -1)
        e = np.array(e).flatten().reshape(batch_size, -1)

        f = np.array(f).flatten().reshape(batch_size, -1)
        g = np.array(g).flatten().reshape(batch_size, -1)
        h = np.array(h).flatten().reshape(batch_size, -1)
        i = np.array(i).flatten().reshape(batch_size, -1)
        j = np.array(j).flatten().reshape(batch_size, -1)

        k = np.array(k).flatten().reshape(batch_size, -1)
        l = np.array(l).flatten().reshape(batch_size, -1)
        m = np.array(m).flatten().reshape(batch_size, -1)
        n = np.array(n).flatten().reshape(batch_size, -1)
        o = np.array(o).flatten().reshape(batch_size, -1)

        p = np.array(p).flatten().reshape(batch_size, -1)
        q = np.array(q).flatten().reshape(batch_size, -1)
        r = np.array(r).flatten().reshape(batch_size, -1)
        s = np.array(s).flatten().reshape(batch_size, -1)
        t = np.array(t).flatten().reshape(batch_size, -1)

        u = np.array(u).flatten().reshape(batch_size, -1)
        return a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u

    data_set = data_set.map(
        operations=mixup,
        input_columns=[
            'label', 'continue_val', 'indicator_id', 'emb_128_id',
            'emb_64_single_id', 'multi_doc_ad_category_id',
            'multi_doc_ad_category_id_mask', 'multi_doc_event_entity_id',
            'multi_doc_event_entity_id_mask', 'multi_doc_ad_entity_id',
            'multi_doc_ad_entity_id_mask', 'multi_doc_event_topic_id',
            'multi_doc_event_topic_id_mask', 'multi_doc_event_category_id',
            'multi_doc_event_category_id_mask', 'multi_doc_ad_topic_id',
            'multi_doc_ad_topic_id_mask', 'ad_id', 'display_ad_and_is_leak',
            'display_id', 'is_leak'
        ],
        column_order=[
            'label', 'continue_val', 'indicator_id', 'emb_128_id',
            'emb_64_single_id', 'multi_doc_ad_category_id',
            'multi_doc_ad_category_id_mask', 'multi_doc_event_entity_id',
            'multi_doc_event_entity_id_mask', 'multi_doc_ad_entity_id',
            'multi_doc_ad_entity_id_mask', 'multi_doc_event_topic_id',
            'multi_doc_event_topic_id_mask', 'multi_doc_event_category_id',
            'multi_doc_event_category_id_mask', 'multi_doc_ad_topic_id',
            'multi_doc_ad_topic_id_mask', 'display_id', 'ad_id',
            'display_ad_and_is_leak', 'is_leak'
        ],
        num_parallel_workers=8)

    data_set = data_set.repeat(epochs)
    return data_set
Example #19
def test_schema_simple():
    logger.info("test_schema_simple")
    ds.Schema(SCHEMA_FILE)
Example #20
def test_simple_schema():
    ds.Schema(SCHEMA_FILE)