import filecmp
import glob
import json
import os

import numpy as np
import pytest

import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as c
import mindspore.dataset.vision.c_transforms as vision
from mindspore import log as logger
from mindspore.dataset.vision import Inter

# Constants such as CV_FILE_NAME, CV_DIR_NAME, DATA_DIR and SCHEMA_DIR, and
# helpers such as get_data(), are defined elsewhere in the test suite and are
# assumed here.


def skip_test_minddataset(add_and_remove_cv_file=True):
    """Tutorial for CV MindDataset with a SubsetRandomSampler."""
    columns_list = ["data", "file_name", "label"]
    num_readers = 4
    indices = [1, 2, 3, 5, 7]
    sampler = ds.SubsetRandomSampler(indices)
    data_set = ds.MindDataset(CV_FILE_NAME + "0",
                              columns_list,
                              num_readers,
                              sampler=sampler)

    # Serialize into a Python dictionary
    ds1_dict = ds.serialize(data_set)
    # Serialize into a JSON string
    ds1_json = json.dumps(ds1_dict, sort_keys=True)

    # Reconstruct the dataset pipeline from its serialized form
    data_set = ds.deserialize(input_dict=ds1_dict)
    ds2_dict = ds.serialize(data_set)
    # Serialize into a JSON string
    ds2_json = json.dumps(ds2_dict, sort_keys=True)

    assert ds1_json == ds2_json

    _ = get_data(CV_DIR_NAME)
    assert data_set.get_dataset_size() == 5
    num_iter = 0
    for _ in data_set.create_dict_iterator(num_epochs=1, output_numpy=True):
        num_iter += 1
    assert num_iter == 5
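
# A minimal round-trip sketch of the contract exercised above (names here
# are local to this example, not part of the suite): serializing a pipeline,
# rebuilding it from the dict and serializing again must yield identical
# canonical JSON.
def _serialize_roundtrip_stable(pipeline):
    first = json.dumps(ds.serialize(pipeline), sort_keys=True)
    rebuilt = ds.deserialize(input_dict=ds.serialize(pipeline))
    second = json.dumps(ds.serialize(rebuilt), sort_keys=True)
    return first == second
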
def test_pipeline():
    """
    Test that the configuration pipeline works when parameters are set at the
    dataset level.
    """
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
    ds.config.set_num_parallel_workers(2)
    data1 = data1.map(input_columns=["image"],
                      operations=[vision.Decode(True)])
    ds.serialize(data1, "testpipeline.json")

    data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
    ds.config.set_num_parallel_workers(4)
    data2 = data2.map(input_columns=["image"],
                      operations=[vision.Decode(True)])
    ds.serialize(data2, "testpipeline2.json")

    # The two JSON files are currently identical: num_parallel_workers set
    # via ds.config is not captured in the serialized pipeline, so changing
    # it between the two runs leaves no trace in the output.
    assert filecmp.cmp('testpipeline.json', 'testpipeline2.json')

    # Remove the generated JSON files
    file_list = glob.glob('*.json')
    for f in file_list:
        try:
            os.remove(f)
        except IOError:
            logger.info("Error while deleting: {}".format(f))
def util_check_serialize_deserialize_file(data_orig, filename,
                                          remove_json_files):
    """
    Utility function for testing serdes files. It is to check if a json file is indeed created with correct name
    after serializing and if it remains the same after repeatedly saving and loading.
    :param data_orig: original data pipeline to be serialized
    :param filename: filename to be saved as json format
    :param remove_json_files: whether to remove the json file after testing
    :return: The data pipeline after serializing and deserializing using the original pipeline
    """
    file1 = filename + ".json"
    file2 = filename + "_1.json"
    ds.serialize(data_orig, file1)
    assert validate_jsonfile(file1) is True
    assert validate_jsonfile("wrong_name.json") is False

    data_changed = ds.deserialize(json_filepath=file1)
    ds.serialize(data_changed, file2)
    assert validate_jsonfile(file2) is True
    assert filecmp.cmp(file1, file2)

    # Remove the generated json file
    if remove_json_files:
        delete_json_files()
    return data_changed
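
# Sketches of the two helpers this utility relies on, assuming behavior
# consistent with their usage in this file (the real definitions live
# elsewhere in the suite): a JSON file is "valid" if it exists and parses to
# a dict, and cleanup removes every .json file in the working directory.
def validate_jsonfile(filepath):
    try:
        with open(filepath, 'r') as jfile:
            loaded_json = json.load(jfile)
    except IOError:
        return False
    return isinstance(loaded_json, dict)


def delete_json_files():
    for f in glob.glob('*.json'):
        try:
            os.remove(f)
        except IOError:
            logger.info("Error while deleting: {}".format(f))
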
def test_mnist_dataset(remove_json_files=True):
    """
    Test serdes on a MnistDataset pipeline with OneHot and batch operations.
    """
    data_dir = "../data/dataset/testMnistData"
    ds.config.set_seed(1)

    data1 = ds.MnistDataset(data_dir, 100)  # 100 samples
    one_hot_encode = c.OneHot(10)  # num_classes is an input argument
    data1 = data1.map(input_columns="label", operations=one_hot_encode)

    # batch_size is an input argument
    data1 = data1.batch(batch_size=10, drop_remainder=True)

    ds.serialize(data1, "mnist_dataset_pipeline.json")
    assert validate_jsonfile("mnist_dataset_pipeline.json") is True

    data2 = ds.deserialize(json_filepath="mnist_dataset_pipeline.json")
    ds.serialize(data2, "mnist_dataset_pipeline_1.json")
    assert validate_jsonfile("mnist_dataset_pipeline_1.json") is True
    assert filecmp.cmp('mnist_dataset_pipeline.json', 'mnist_dataset_pipeline_1.json')

    data3 = ds.deserialize(json_filepath="mnist_dataset_pipeline_1.json")

    num = 0
    for item1, item2, item3 in zip(data1.create_dict_iterator(num_epochs=1, output_numpy=True),
                                   data2.create_dict_iterator(num_epochs=1, output_numpy=True),
                                   data3.create_dict_iterator(num_epochs=1, output_numpy=True)):
        assert np.array_equal(item1['image'], item2['image'])
        assert np.array_equal(item1['image'], item3['image'])
        assert np.array_equal(item1['label'], item2['label'])
        assert np.array_equal(item1['label'], item3['label'])
        num += 1

    logger.info("mnist total num samples is {}".format(str(num)))
    assert num == 10

    if remove_json_files:
        delete_json_files()
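
# Note: the serialize/compare steps above duplicate the logic of
# util_check_serialize_deserialize_file(); the same check could be written
# as, for example:
#   data2 = util_check_serialize_deserialize_file(
#       data1, "mnist_dataset_pipeline", remove_json_files=False)
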
def test_serdes_zip_dataset(remove_json_files=True):
    """
    Test serdes on zip dataset pipeline.
    """
    files = ["../data/dataset/testTFTestAllTypes/test.data"]
    schema_file = "../data/dataset/testTFTestAllTypes/datasetSchema.json"
    ds.config.set_seed(1)

    ds0 = ds.TFRecordDataset(files,
                             schema=schema_file,
                             shuffle=ds.Shuffle.GLOBAL)
    data1 = ds.TFRecordDataset(files,
                               schema=schema_file,
                               shuffle=ds.Shuffle.GLOBAL)
    data2 = ds.TFRecordDataset(files,
                               schema=schema_file,
                               shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(10000)
    data2 = data2.rename(input_columns=[
        "col_sint16", "col_sint32", "col_sint64", "col_float", "col_1d",
        "col_2d", "col_3d", "col_binary"
    ],
                         output_columns=[
                             "column_sint16", "column_sint32", "column_sint64",
                             "column_float", "column_1d", "column_2d",
                             "column_3d", "column_binary"
                         ])
    data3 = ds.zip((data1, data2))
    ds.serialize(data3, "zip_dataset_pipeline.json")
    assert validate_jsonfile("zip_dataset_pipeline.json") is True
    assert validate_jsonfile("zip_dataset_pipeline_typo.json") is False

    data4 = ds.deserialize(json_filepath="zip_dataset_pipeline.json")
    ds.serialize(data4, "zip_dataset_pipeline_1.json")
    assert validate_jsonfile("zip_dataset_pipeline_1.json") is True
    assert filecmp.cmp('zip_dataset_pipeline.json',
                       'zip_dataset_pipeline_1.json')

    rows = 0
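    # Column layout after zip(): each row is [data1 columns..., data2 renamed
    # columns...], so column i of ds0 appears both at index i (from data1)
    # and at index i + num_cols (from data2); the offset arithmetic below
    # relies on this.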
    for d0, d3, d4 in zip(ds0.create_tuple_iterator(output_numpy=True),
                          data3.create_tuple_iterator(output_numpy=True),
                          data4.create_tuple_iterator(output_numpy=True)):
        num_cols = len(d0)
        offset = 0
        for t1 in d0:
            np.testing.assert_array_equal(t1, d3[offset])
            np.testing.assert_array_equal(t1, d3[offset + num_cols])
            np.testing.assert_array_equal(t1, d4[offset])
            np.testing.assert_array_equal(t1, d4[offset + num_cols])
            offset += 1
        rows += 1
    assert rows == 12

    if remove_json_files:
        delete_json_files()
def test_random_crop():
    """
    Test serialize/deserialize on a RandomCrop pipeline via the dictionary form.
    """
    logger.info("test_random_crop")
    DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]
    SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json"

    # First dataset
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"])
    decode_op = vision.Decode()
    random_crop_op = vision.RandomCrop([512, 512], [200, 200, 200, 200])
    data1 = data1.map(input_columns="image", operations=decode_op)
    data1 = data1.map(input_columns="image", operations=random_crop_op)

    # Serialize into a Python dictionary
    ds1_dict = ds.serialize(data1)
    # Serialize into a JSON string
    _ = json.dumps(ds1_dict, indent=2)

    # Reconstruct the dataset pipeline from its serialized form
    data1_1 = ds.deserialize(input_dict=ds1_dict)

    # Second dataset
    data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"])
    data2 = data2.map(input_columns="image", operations=decode_op)

    for item1, item1_1, item2 in zip(data1.create_dict_iterator(num_epochs=1, output_numpy=True),
                                     data1_1.create_dict_iterator(num_epochs=1, output_numpy=True),
                                     data2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        assert np.array_equal(item1['image'], item1_1['image'])
        _ = item2["image"]
    def begin(self, run_context):
        """
        Initialize the training progress when the training job begins.

        Args:
            run_context (RunContext): It contains all lineage information,
                see mindspore.train.callback.RunContext.

        Raises:
            MindInsightException: If validating parameter fails.
        """
        log.info('Initialize training lineage collection...')

        if self.user_defined_info:
            self.lineage_summary.record_user_defined_info(
                self.user_defined_info)

        if not isinstance(run_context, RunContext):
            error_msg = 'Invalid TrainLineage run_context.'
            log.error(error_msg)
            raise LineageParamRunContextError(error_msg)

        run_context_args = run_context.original_args()
        if not self.initial_learning_rate:
            optimizer = run_context_args.get('optimizer')
            if optimizer and not isinstance(optimizer, Optimizer):
                log.error(
                    "The parameter optimizer is invalid. It should be an instance of "
                    "mindspore.nn.optim.optimizer.Optimizer.")
                raise MindInsightException(
                    error=LineageErrors.PARAM_OPTIMIZER_ERROR,
                    message=LineageErrorMsg.PARAM_OPTIMIZER_ERROR.value)
            if optimizer:
                log.info('Obtaining initial learning rate...')
                self.initial_learning_rate = AnalyzeObject.analyze_optimizer(
                    optimizer)
                log.debug('initial_learning_rate: %s',
                          self.initial_learning_rate)
            else:
                network = run_context_args.get('train_network')
                optimizer = AnalyzeObject.get_optimizer_by_network(network)
                self.initial_learning_rate = AnalyzeObject.analyze_optimizer(
                    optimizer)
                log.debug('initial_learning_rate: %s',
                          self.initial_learning_rate)

        # get train dataset graph
        train_dataset = run_context_args.get('train_dataset')
        dataset_graph_dict = ds.serialize(train_dataset)
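        # Round-trip through json.dumps/json.loads to normalize the
        # serialized graph into plain JSON-compatible types before logging.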
        dataset_graph_json_str = json.dumps(dataset_graph_dict, indent=2)
        dataset_graph_dict = json.loads(dataset_graph_json_str)
        log.info('Logging dataset graph...')
        try:
            self.lineage_summary.record_dataset_graph(
                dataset_graph=dataset_graph_dict)
        except Exception as error:
            error_msg = f'Dataset graph log error in TrainLineage begin: {error}'
            log.error(error_msg)
            raise LineageLogError(error_msg) from error
        log.info('Dataset graph logged successfully.')
def test_serdes_exception():
    """
    Test exception case in serdes
    """
    data_dir = [
        "../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"
    ]
    schema_file = "../data/dataset/test_tf_file_3_images/datasetSchema.json"
    data1 = ds.TFRecordDataset(data_dir,
                               schema_file,
                               columns_list=["image", "label"],
                               shuffle=False)
    data1 = data1.filter(input_columns=["image", "label"],
                         predicate=lambda data: data < 11,
                         num_parallel_workers=4)
    data1_json = ds.serialize(data1)
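    # Filter carries a Python predicate that cannot be rebuilt from the
    # serialized form, so deserialization is expected to fail here.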
    with pytest.raises(RuntimeError) as msg:
        ds.deserialize(input_dict=data1_json)
    assert "Filter is not yet supported by ds.engine.deserialize" in str(msg)
def test_serdes_random_crop():
    """
    Test serdes on RandomCrop pipeline.
    """
    logger.info("test_random_crop")
    DATA_DIR = [
        "../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"
    ]
    SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json"
    original_seed = config_get_set_seed(1)
    original_num_parallel_workers = config_get_set_num_parallel_workers(1)

    # First dataset
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"])
    decode_op = vision.Decode()
    random_crop_op = vision.RandomCrop([512, 512], [200, 200, 200, 200])
    data1 = data1.map(operations=decode_op, input_columns="image")
    data1 = data1.map(operations=random_crop_op, input_columns="image")

    # Serialize into a Python dictionary
    ds1_dict = ds.serialize(data1)
    # Serialize into a JSON string
    _ = json.dumps(ds1_dict, indent=2)

    # Reconstruct the dataset pipeline from its serialized form
    data1_1 = ds.deserialize(input_dict=ds1_dict)

    # Second dataset
    data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"])
    data2 = data2.map(operations=decode_op, input_columns="image")

    for item1, item1_1, item2 in zip(
            data1.create_dict_iterator(num_epochs=1, output_numpy=True),
            data1_1.create_dict_iterator(num_epochs=1, output_numpy=True),
            data2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(item1['image'], item1_1['image'])
        _ = item2["image"]

    # Restore the original configuration (seed and num_parallel_workers)
    ds.config.set_seed(original_seed)
    ds.config.set_num_parallel_workers(original_num_parallel_workers)
def test_serdes_imagefolder_dataset(remove_json_files=True):
    """
    Test simulating resnet50 dataset pipeline.
    """
    data_dir = "../data/dataset/testPK/data"
    ds.config.set_seed(1)

    # define data augmentation parameters
    rescale = 1.0 / 255.0
    shift = 0.0
    resize_height, resize_width = 224, 224
    weights = [
        1.0, 0.1, 0.02, 0.3, 0.4, 0.05, 1.2, 0.13, 0.14, 0.015, 0.16, 1.1
    ]

    # Constructing DE pipeline
    sampler = ds.WeightedRandomSampler(weights, 11)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)
    data1 = ds.ImageFolderDataset(data_dir, sampler=sampler)
    data1 = data1.repeat(1)
    data1 = data1.map(operations=[vision.Decode(True)],
                      input_columns=["image"])
    rescale_op = vision.Rescale(rescale, shift)

    resize_op = vision.Resize((resize_height, resize_width), Inter.LINEAR)
    data1 = data1.map(operations=[rescale_op, resize_op],
                      input_columns=["image"])
    data1 = data1.batch(2)

    # Serialize the dataset pre-processing pipeline.
    # data1 should still work after saving.
    ds.serialize(data1, "imagenet_dataset_pipeline.json")
    ds1_dict = ds.serialize(data1)
    assert validate_jsonfile("imagenet_dataset_pipeline.json") is True

    # Print the serialized pipeline to stdout
    ds.show(data1)

    # Deserialize the serialized json file
    data2 = ds.deserialize(json_filepath="imagenet_dataset_pipeline.json")

    # Serialize the pipeline we just deserialized.
    # The content of the json file should be the same as the previously serialized one.
    ds.serialize(data2, "imagenet_dataset_pipeline_1.json")
    assert validate_jsonfile("imagenet_dataset_pipeline_1.json") is True
    assert filecmp.cmp('imagenet_dataset_pipeline.json',
                       'imagenet_dataset_pipeline_1.json')

    # Deserialize the latest json file again
    data3 = ds.deserialize(json_filepath="imagenet_dataset_pipeline_1.json")
    data4 = ds.deserialize(input_dict=ds1_dict)
    num_samples = 0
    # Iterate and compare the data in the original pipeline (data1) against the deserialized pipeline (data2)
    for item1, item2, item3, item4 in zip(
            data1.create_dict_iterator(num_epochs=1, output_numpy=True),
            data2.create_dict_iterator(num_epochs=1, output_numpy=True),
            data3.create_dict_iterator(num_epochs=1, output_numpy=True),
            data4.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(item1['image'], item2['image'])
        np.testing.assert_array_equal(item1['image'], item3['image'])
        np.testing.assert_array_equal(item1['label'], item2['label'])
        np.testing.assert_array_equal(item1['label'], item3['label'])
        np.testing.assert_array_equal(item3['image'], item4['image'])
        np.testing.assert_array_equal(item3['label'], item4['label'])
        num_samples += 1

    logger.info("Number of data in data1: {}".format(num_samples))
    assert num_samples == 6

    # Remove the generated json file
    if remove_json_files:
        delete_json_files()