def test_multiple_iterators():
    channel, directory = write_to_channel("A", [b"bear"])

    dataset = PipeModeDataset(channel,
                              pipe_dir=directory,
                              state_dir=directory,
                              config_dir=directory)
    with tf.Session() as sess:
        it = dataset.make_one_shot_iterator()
        next = it.get_next()
        assert sess.run(next) == b"bear"
        with pytest.raises(tf.errors.OutOfRangeError):
            sess.run(next)

    # Write a second epoch of data to the next pipe in sequence (<channel>_1).
    with open(os.path.join(directory, channel + "_1"), 'wb') as f:
        write_recordio(f, b"bunny")
        write_recordio(f, b"piano")
        write_recordio(f, b"caterpillar")

    with tf.Session() as sess:
        it = dataset.make_one_shot_iterator()
        next = it.get_next()
        assert b"bunny" == sess.run(next)
        assert b"piano" == sess.run(next)
        assert b"caterpillar" == sess.run(next)
        with pytest.raises(tf.errors.OutOfRangeError):
            sess.run(next)
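# The tests above call helpers from the package's test support (write_to_channel,
# write_recordio, write_config). A minimal sketch of the RecordIO framing the
# tests assume -- magic number, length, payload, zero padding to a 4-byte
# boundary -- not the library's canonical implementation:
import os
import struct
import tempfile

_KMAGIC = 0xced7230a  # RecordIO magic number used by SageMaker pipe mode


def write_recordio(f, data):
    """Write one RecordIO-framed record to the file-like object f."""
    length = len(data)
    f.write(struct.pack('<I', _KMAGIC))   # little-endian magic
    f.write(struct.pack('<I', length))    # little-endian payload length
    f.write(data)
    f.write(b'\x00' * ((4 - length % 4) % 4))  # pad to 4-byte boundary


def write_to_channel(channel, records):
    """Stage one epoch of records as the first pipe file, <channel>_0."""
    directory = tempfile.mkdtemp()
    with open(os.path.join(directory, channel + "_0"), 'wb') as f:
        for record in records:
            write_recordio(f, record)
    write_config(directory, channel)  # write_config comes from the test support
    return channel, directory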
    def _input_fn():
        def _read_and_decode(record):
            features = tf.parse_single_example(
                record,
                features={
                    'image_raw': tf.FixedLenFeature([], tf.string),
                    'label': tf.FixedLenFeature([], tf.int64),
                })

            image = tf.decode_raw(features['image_raw'], tf.uint8)
            image.set_shape([HEIGHT * WIDTH * DEPTH])
            image = tf.cast(image, tf.float32) * (1. / 255)
            label = tf.cast(features['label'], tf.int32)

            return {INPUT_TENSOR_NAME: image}, label

        ds = PipeModeDataset(channel, record_format='TFRecord')
        ds = ds.repeat()
        ds = ds.prefetch(batch_size)
        ds = ds.map(_read_and_decode, num_parallel_calls=NUM_PARALLEL_BATCHES)

        if channel == 'train':
            ds = ds.shuffle(buffer_size=batch_size)

        ds = ds.batch(batch_size, drop_remainder=True)
        return ds.make_one_shot_iterator().get_next()
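# _input_fn above closes over names defined elsewhere in its script. Plausible
# module-level definitions, assuming 32x32 RGB images (all values hypothetical):
HEIGHT, WIDTH, DEPTH = 32, 32, 3
INPUT_TENSOR_NAME = 'inputs'
NUM_PARALLEL_BATCHES = 10
batch_size = 128
channel = 'train'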
def _input(epochs, batch_size, channel, channel_name):

    filenames = get_filenames(channel_name, channel)
    # ----- Added section (PipeModeDataset) -----
    # dataset = tf.data.TFRecordDataset(filenames)
    dataset = PipeModeDataset(channel=channel_name, record_format='TFRecord')

    dataset = dataset.repeat(epochs)
    dataset = dataset.prefetch(10)

    # Parse records.
    dataset = dataset.map(
        _dataset_parser, num_parallel_calls=10)

    # Potentially shuffle records.
    if channel_name == 'train':
        # Ensure that the capacity is sufficiently large to provide good random
        # shuffling.
        buffer_size = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 0.4) + 3 * batch_size
        dataset = dataset.shuffle(buffer_size=buffer_size)

    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    iterator = dataset.make_one_shot_iterator()
    image_batch, label_batch = iterator.get_next()

    return {INPUT_TENSOR_NAME: image_batch}, label_batch
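# The _input pipelines map a _dataset_parser over serialized tf.Example
# records. A minimal sketch, assuming a CIFAR-10-style schema with raw image
# bytes and an integer label (the feature names are assumptions):
def _dataset_parser(value):
    featdef = {
        'image': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    example = tf.parse_single_example(value, featdef)
    image = tf.decode_raw(example['image'], tf.uint8)
    image.set_shape([DEPTH * HEIGHT * WIDTH])
    # Reshape CHW bytes to an HWC float image.
    image = tf.cast(
        tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]),
        tf.float32)
    label = tf.cast(example['label'], tf.int32)
    return image, label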
def test_csv():
    channel_dir = tempfile.mkdtemp()
    state_dir = tempfile.mkdtemp()
    epochs = 1
    channel_name = 'testchannel'
    write_config(channel_dir, 'testchannel')

    create_fifos(epochs, channel_dir, channel_name, input_file='test.csv')

    def parse(line):
        fields = tf.decode_csv(line, FIELD_DEFAULTS)
        features = dict(zip(COLUMNS, fields))
        return features

    with tf.Session() as sess:
        ds = PipeModeDataset(channel_name, pipe_dir=channel_dir, state_dir=state_dir, config_dir=channel_dir,
                             record_format='TextLine')
        ds = ds.map(parse)

        it = ds.make_one_shot_iterator()
        next = it.get_next()
        for _ in range(100):
            d = sess.run(next)
            sys.stdout.flush()
            assert d == {str(col): col for col in range(100)}
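# test_csv asserts that each parsed row equals {str(i): i for i in range(100)},
# so the COLUMNS/FIELD_DEFAULTS fixtures presumably describe a CSV of 100
# integer columns named '0'..'99'. A definition consistent with that assertion:
COLUMNS = [str(i) for i in range(100)]
FIELD_DEFAULTS = [[0] for _ in range(100)]  # one int32 default per column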
def input_fn():
    ds = PipeModeDataset(channel_name, pipe_dir=channel_dir, state_dir=state_dir, config_dir=channel_dir)
    ds = ds.map(parse, num_parallel_calls=12)
    ds = ds.prefetch(3)
    ds = ds.batch(3)
    it = ds.make_one_shot_iterator()
    return it.get_next()
def test_single_record():
    channel, directory = write_to_channel("A", [b"bear"])
    with tf.Session() as sess:
        dataset = PipeModeDataset(channel, pipe_dir=directory, state_dir=directory, config_dir=directory)
        it = dataset.make_one_shot_iterator()
        next = it.get_next()
        assert b"bear" == sess.run(next)
Example #7
def process_input(epochs, batch_size, channel, channel_name, data_config):

    mode = data_config[channel_name]['TrainingInputMode']
    filenames = _get_filenames(channel_name, channel)
    logging.info("Running {} in {} mode".format(channel_name, mode))
    # Read records from the pipe (Pipe mode) or from staged files (File mode).
    if mode == 'Pipe':
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name,
                                  record_format='TFRecord')
    else:
        dataset = tf.data.TFRecordDataset(filenames)

    dataset = dataset.repeat(epochs)
    dataset = dataset.prefetch(10)

    # Parse records.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=10)

    # Potentially shuffle records.
    if channel_name == 'train':
        # Ensure that the capacity is sufficiently large to provide good random
        # shuffling.
        buffer_size = int(
            NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 0.4) + 3 * batch_size
        dataset = dataset.shuffle(buffer_size=buffer_size)

    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    iterator = dataset.make_one_shot_iterator()
    image_batch, label_batch = iterator.get_next()

    return image_batch, label_batch
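# process_input returns a (features, labels) pair of tensors, so it plugs into
# the tf.estimator API through a zero-argument closure. A hypothetical wiring
# (the channel path, hyperparameters, and estimator are placeholders):
train_input_fn = lambda: process_input(
    epochs=10,
    batch_size=128,
    channel='/opt/ml/input/data/train',
    channel_name='train',
    data_config=data_config)

estimator.train(input_fn=train_input_fn, steps=1000)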
def test_multiple_records():
    channel, directory = write_to_channel("B", [b"bunny", b"caterpillar"])
    with tf.Session() as sess:
        dataset = PipeModeDataset(channel, pipe_dir=directory, state_dir=directory, config_dir=directory)
        it = dataset.make_one_shot_iterator()
        next = it.get_next()
        assert b"bunny" == sess.run(next)
        assert b"caterpillar" == sess.run(next)
def test_benchmark_records_interval_disabled(capfd):
    channel, directory = write_to_channel("A", [b"bear"])

    dataset = PipeModeDataset(channel,
                              pipe_dir=directory,
                              state_dir=directory,
                              config_dir=directory,
                              benchmark_records_interval=0)
    with tf.Session() as sess:
        it = dataset.make_one_shot_iterator()
        next = it.get_next()
        assert sess.run(next) == b"bear"
        out, err = capfd.readouterr()
        assert 'Iterator records' not in out
def test_out_of_range():
    channel, directory = write_to_channel("A", [b"bear", b"bunny", b"truck"])
    with tf.Session() as sess:
        dataset = PipeModeDataset(channel, pipe_dir=directory, state_dir=directory, config_dir=directory)
        it = dataset.make_one_shot_iterator()
        next = it.get_next()
        for i in range(3):
            sess.run(next)
        with pytest.raises(tf.errors.OutOfRangeError):
            sess.run(next)
def test_large_record():
    channel, directory = write_to_channel("C", [b"a" * 1000000])

    with tf.Session() as sess:
        dataset = PipeModeDataset(channel,
                                  pipe_dir=directory,
                                  state_dir=directory,
                                  config_dir=directory)
        it = dataset.make_one_shot_iterator()
        next = it.get_next()
        assert b"a" * 1000000 == sess.run(next)
Example #12
def _input(epochs, batch_size, channel, channel_name):
    """Uses the tf.data input pipeline for our dataset.

    Args:
        mode: Standard names for model modes (tf.estimators.ModeKeys).
        batch_size: The number of samples per batch of input requested.
    """
    mode = args.data_config[channel_name]['TrainingInputMode']
    logging.info("Running {} in {} mode for {} epochs".format(channel_name, mode, epochs))

    filenames = get_filenames(channel_name, channel)

    # Read records from the pipe (Pipe mode) or from staged files (File mode).
    if mode == 'Pipe':
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name, record_format='TFRecord')
    else:
        dataset = tf.data.TFRecordDataset(filenames)

    dataset = dataset.repeat(epochs)
    dataset = dataset.prefetch(batch_size)

    # Parse records.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=10)

    # Shuffle training records.
    if channel_name == 'train':
        # Ensure that the capacity is sufficiently large to provide good random
        # shuffling.
        buffer_size = args.num_train_samples // args.batch_size
        dataset = dataset.shuffle(buffer_size=buffer_size)

    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)

    # make_one_shot_iterator moved under tf.compat.v1 in TensorFlow 2.
    if tf.version.VERSION[0] == '2':
        iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
    else:
        iterator = dataset.make_one_shot_iterator()

    features_batch, label_batch = iterator.get_next()

    # No session is needed just to log the tensor types.
    logging.info('type of features_batch: {}'.format(type(features_batch)))
    logging.info('label_batch: {}'.format(label_batch))
    logging.info('type of label_batch: {}'.format(type(label_batch)))

    return {INPUT_TENSOR_NAME: features_batch}, label_batch
def test_invalid_data():
    directory = tempfile.mkdtemp()
    filename = "X_0"
    with open(os.path.join(directory, filename), 'wb') as f:
        f.write(b"adfsafasfd")
    write_config(directory, 'X')
    dataset = PipeModeDataset("X", pipe_dir=directory, state_dir=directory, config_dir=directory)
    with pytest.raises(tf.errors.InternalError):
        with tf.Session() as sess:
            it = dataset.make_one_shot_iterator()
            next = it.get_next()
            sess.run(next)
def test_tf_record():
    channel_dir = tempfile.mkdtemp()
    state_dir = tempfile.mkdtemp()
    epochs = 1
    channel_name = 'testchannel'
    create_fifos(epochs, channel_dir, channel_name, input_file='test.tfrecords')
    write_config(channel_dir, 'testchannel')

    ds = PipeModeDataset(channel_name, pipe_dir=channel_dir, state_dir=state_dir, config_dir=channel_dir,
                         record_format='TFRecord')

    with tf.Session() as sess:
        it = ds.make_one_shot_iterator()
        next = it.get_next()
        for i in range(100):
            assert sess.run(next) == b'hello world'
Example #15
def _input(epochs, batch_size, channel, channel_name):
    """Uses the tf.data input pipeline for CIFAR-10 dataset.

    Args:
        mode: Standard names for model modes (tf.estimators.ModeKeys).
        batch_size: The number of samples per batch of input requested.
    """
    mode = args.data_config[channel_name]['TrainingInputMode']
    filenames = get_filenames(channel_name, channel)
    logging.info("Running {} in {} mode".format(channel_name, mode))
    # Read records from the pipe (Pipe mode) or from staged files (File mode).
    if mode == 'Pipe':
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name,
                                  record_format='TFRecord')
    else:
        dataset = tf.data.TFRecordDataset(filenames)

    dataset = dataset.repeat(epochs)
    dataset = dataset.prefetch(10)

    # Parse records.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=10)

    # Potentially shuffle records.
    if channel_name == 'train':
        # Ensure that the capacity is sufficiently large to provide good random
        # shuffling.
        buffer_size = int(
            NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * 0.4) + 3 * batch_size
        dataset = dataset.shuffle(buffer_size=buffer_size)

    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    iterator = dataset.make_one_shot_iterator()
    image_batch, label_batch = iterator.get_next()

    return {INPUT_TENSOR_NAME: image_batch}, label_batch
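# In File mode the snippets above fall back to get_filenames. A hypothetical
# implementation that simply lists the TFRecord files staged under the channel
# directory (the real helper may hard-code specific file names):
def get_filenames(channel_name, channel):
    if channel_name not in ('train', 'validation', 'eval'):
        raise ValueError('Invalid channel: %s' % channel_name)
    return [os.path.join(channel, name) for name in sorted(os.listdir(channel))]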
# Perform Estimator training
column = tf.feature_column.numeric_column('data', shape=(config.dimension, ))
model_dir = tempfile.mkdtemp()
estimator = tf.estimator.LinearClassifier(feature_columns=[column])

estimator.train(input_fn=input_fn)

# Confirm that all epoch pipes were consumed: after training, the next pipe
# in sequence (<channel>_<epochs + 1>) should already have been created.
assert os.path.exists('/opt/ml/input/data/{}_{}'.format(
    config.channel, config.epochs + 1))

# Test that we can create a new PipeModeDataset after training has run
ds = PipeModeDataset(config.channel)

with tf.Session() as sess:
    it = ds.make_one_shot_iterator()
    next = it.get_next()
    sess.run(next)

# Test that we can create a PipeModeDataset, discard it, and read from a new one
ds = PipeModeDataset(config.channel)
with tf.Session() as sess:
    it = ds.make_one_shot_iterator()
    next = it.get_next()
ds = PipeModeDataset(config.channel)
with tf.Session() as sess:
    it = ds.make_one_shot_iterator()
    next = it.get_next()
    sess.run(next)