Example #1
def test_some_processing_functions(carbon_synthetic_dataset, reader_factory):
    """Try several ``tf.data.Dataset`` dataset operations on make_pycarbon_dataset"""

    # reader1 will have a single row with id=1, reader2: a single row with id=2

    # Using functools.partial(operator.eq, 1), which is equivalent to lambda x: x == 1, because the standard
    # Python pickle module cannot pickle a lambda
    with reader_factory(carbon_synthetic_dataset.url,
                        predicate=in_lambda(['id'],
                                            functools.partial(operator.eq,
                                                              1))) as reader1:
        with reader_factory(carbon_synthetic_dataset.url,
                            predicate=in_lambda(['id'],
                                                functools.partial(
                                                    operator.eq,
                                                    2))) as reader2:
            dataset = make_pycarbon_dataset(reader1) \
              .prefetch(10) \
              .concatenate(make_pycarbon_dataset(reader2)) \
              .map(lambda x: x.id) \
              .batch(2)

            next_sample = dataset.make_one_shot_iterator().get_next()

            with tf.Session() as sess:
                # 'actual' is expected to contain the id column of the concatenated dataset
                actual = sess.run(next_sample)
                np.testing.assert_array_equal(actual, [1, 2])
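
The functools.partial workaround used above is easy to verify in isolation: a lambda fails to pickle with the standard pickle module, while the equivalent partial round-trips cleanly. A minimal standard-library sketch (not part of the original test):

import functools
import operator
import pickle

predicate_fn = functools.partial(operator.eq, 1)
restored = pickle.loads(pickle.dumps(predicate_fn))  # partials pickle fine
assert restored(1) and not restored(2)

try:
    pickle.dumps(lambda x: x == 1)
except (pickle.PicklingError, AttributeError):
    print('a lambda cannot be pickled by the standard pickle module')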
Example #2
def test_dataset_on_ngram_not_supported(carbon_synthetic_dataset,
                                        reader_factory):
    ngram = NGram({
        0: list(_EXCLUDE_FIELDS),
        1: [TestSchema.id]
    }, 100, TestSchema.id)
    with reader_factory(carbon_synthetic_dataset.url,
                        schema_fields=ngram) as reader:
        with pytest.raises(NotImplementedError):
            make_pycarbon_dataset(reader)
def tensorflow_hello_world(dataset_url='file:///tmp/carbon_pycarbon_dataset/'):
    # Example: tf_tensors will return tensors with dataset data
    with make_carbon_reader(dataset_url) as reader:
        tensor = tf_tensors(reader)
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)

    with make_reader(dataset_url, is_batch=False) as reader:
        tensor = make_tensor(reader)
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)

    # Example: use tf.data.Dataset API
    with make_carbon_reader(dataset_url) as reader:
        dataset = make_pycarbon_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)

    with make_reader(dataset_url, is_batch=False) as reader:
        dataset = make_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)
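
Each snippet above fetches only a single sample. To drain the whole dataset with the tf.data API in TF 1.x, the usual pattern is to keep running the iterator's next op until tf.errors.OutOfRangeError is raised. A hedged sketch, assuming the same imports as above and a single-epoch reader; the function name is ours:

def tensorflow_read_all_rows(dataset_url='file:///tmp/carbon_pycarbon_dataset/'):
    with make_carbon_reader(dataset_url, num_epochs=1) as reader:
        dataset = make_pycarbon_dataset(reader)
        next_sample = dataset.make_one_shot_iterator().get_next()
        with tf.Session() as sess:
            try:
                while True:
                    sample = sess.run(next_sample)
                    print(sample.id)
            except tf.errors.OutOfRangeError:
                pass  # reached the end of the single epoch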
def tensorflow_hello_world(dataset_url='file:///tmp/carbon_external_dataset'):
    # Example: tf_tensors will return tensors with dataset data
    with make_batch_carbon_reader(dataset_url) as reader:
        tensor = tf_tensors(reader)
        with tf.Session() as sess:
            # Because we are using make_batch_carbon_reader(), each read returns a batch of rows instead of a single row
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

    with make_reader(dataset_url) as reader:
        tensor = make_tensor(reader)
        with tf.Session() as sess:
            # make_reader() also reads in batch mode by default, so each read returns a batch of rows instead of a single row
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

    # Example: use tf.data.Dataset API
    with make_batch_carbon_reader(dataset_url) as reader:
        dataset = make_pycarbon_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

    with make_reader(dataset_url) as reader:
        dataset = make_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))
Example #5
def test_with_one_shot_iterator(carbon_synthetic_dataset, reader_factory):
    """Just a bunch of read and compares of all values to the expected values"""
    with reader_factory(carbon_synthetic_dataset.url) as reader:
        dataset = make_pycarbon_dataset(reader)
        iterator = dataset.make_one_shot_iterator()

        # Make sure we have static shape info for all fields
        for shape in dataset.output_shapes:
            # TODO(yevgeni): check that the shapes are actually correct, not just not None
            assert shape.dims is not None

        # Read a bunch of entries from the dataset and compare the data to reference
        with tf.Session() as sess:
            next_sample = iterator.get_next()
            for _, _ in enumerate(carbon_synthetic_dataset.data):
                actual = sess.run(next_sample)._asdict()
                expected = next(d for d in carbon_synthetic_dataset.data
                                if d['id'] == actual['id'])
                for key in actual.keys():
                    if isinstance(expected[key], str):
                        # TensorFlow returns all strings as bytes in Python 3, so we need to decode them
                        actual_value = actual[key].decode()
                    elif isinstance(
                            expected[key], np.ndarray
                    ) and expected[key].dtype.type == np.unicode_:
                        actual_value = np.array(
                            [item.decode() for item in actual[key]])
                    else:
                        actual_value = actual[key]

                    np.testing.assert_equal(actual_value, expected[key])

            # Exhausted one full epoch. Fetching next value should trigger OutOfRangeError
            with pytest.raises(tf.errors.OutOfRangeError):
                sess.run(next_sample)
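
The bytes-vs-str handling above can be factored into a small helper that normalizes a fetched sample before comparison. A sketch using only Python and NumPy (the helper name is ours; np is numpy, as in the examples above):

def decode_bytes_fields(sample_dict):
    """Decode bytes values that TensorFlow returns for string fields in Python 3."""
    decoded = {}
    for key, value in sample_dict.items():
        if isinstance(value, bytes):
            decoded[key] = value.decode()
        elif isinstance(value, np.ndarray) and value.size and isinstance(value.flat[0], bytes):
            decoded[key] = np.array([item.decode() for item in value])
        else:
            decoded[key] = value
    return decoded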
Example #6
def test_dataset_batch_carbon_reader(carbon_scalar_dataset):
    with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                  num_epochs=1) as reader:
        dataset = make_pycarbon_dataset(reader) \
          .apply(tf.data.experimental.unbatch()) \
          .batch(batch_size=1)

        iterator = dataset.make_one_shot_iterator()

        tensor = iterator.get_next()

        with tf.Session() as sess:
            sess.run([
                tf.local_variables_initializer(),
                tf.global_variables_initializer(),
            ])
            i = 0
            try:
                while True:
                    sess.run(tensor)
                    i += 1
            except tf.errors.OutOfRangeError:
                print("Finish! the number is " + str(i))

            assert i == _ROWS_COUNT
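
Unbatching to size-1 batches, as above, is a simple way to count rows but adds per-row overhead. The rows can also be counted from the native batches by summing their lengths; a sketch assuming the dataset exposes an id column, as elsewhere in these examples (the test name is ours):

def test_row_count_without_unbatch(carbon_scalar_dataset):
    with make_batch_carbon_reader(carbon_scalar_dataset.url, num_epochs=1) as reader:
        next_batch = make_pycarbon_dataset(reader).make_one_shot_iterator().get_next()
        with tf.Session() as sess:
            rows = 0
            try:
                while True:
                    rows += len(sess.run(next_batch).id)
            except tf.errors.OutOfRangeError:
                pass
            assert rows == _ROWS_COUNT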
Example #7
def test_non_unischema_with_many_columns_with_one_shot_iterator(
        carbon_many_columns_non_unischema_dataset):
    """Just a bunch of read and compares of all values to the expected values"""
    with make_batch_carbon_reader(
            carbon_many_columns_non_unischema_dataset.url,
            workers_count=1) as reader:
        dataset = make_pycarbon_dataset(reader)
        iterator = dataset.make_one_shot_iterator()

        # Make sure we have static shape info for all fields
        for shape in dataset.output_shapes:
            # TODO(yevgeni): check that the shapes are actually correct, not just not None
            assert shape.dims is not None

        # Read a bunch of entries from the dataset and compare the data to reference
        with tf.Session() as sess:
            next_sample = iterator.get_next()
            sample = sess.run(next_sample)._asdict()
            assert set(sample.keys()) == set(
                carbon_many_columns_non_unischema_dataset.data[0].keys())
Example #8
def test_with_dataset_repeat(carbon_synthetic_dataset, reader_factory):
    """``tf.data.Dataset``'s ``repeat`` should not be used on ``make_pycarbon_dataset`` due to high costs of
  ``Reader initialization``. A user should use ``Reader`` built-in epochs support. Check that we raise an
  error to alert of misuse."""
    with reader_factory(carbon_synthetic_dataset.url) as reader:
        dataset = make_pycarbon_dataset(reader)

        dataset = dataset.repeat(2)

        iterator = dataset.make_one_shot_iterator()

        # Read a bunch of entries from the dataset and compare the data to reference
        with tf.Session() as sess:
            next_sample = iterator.get_next()

            for _, _ in enumerate(carbon_synthetic_dataset.data):
                sess.run(next_sample)

            with pytest.raises(tf.errors.UnknownError,
                               match=r'.*Multiple iterations.*'):
                sess.run(next_sample)
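
The supported way to make multiple passes over the data is the reader's num_epochs argument rather than Dataset.repeat(). A sketch of two epochs over the synthetic dataset, assuming the reader then yields every row exactly twice (the test name is ours):

def test_two_epochs_via_reader(carbon_synthetic_dataset):
    with make_carbon_reader(carbon_synthetic_dataset.url, num_epochs=2) as reader:
        next_sample = make_pycarbon_dataset(reader).make_one_shot_iterator().get_next()
        with tf.Session() as sess:
            count = 0
            try:
                while True:
                    sess.run(next_sample)
                    count += 1
            except tf.errors.OutOfRangeError:
                pass
            assert count == 2 * len(carbon_synthetic_dataset.data)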
Example #9
def test_dynamic_batch_size_of_carbon_reader(carbon_synthetic_dataset):
    with make_carbon_reader(carbon_synthetic_dataset.url,
                            num_epochs=None) as reader:
        batch_size = tf.data.Dataset.range(
            1, 10).make_one_shot_iterator().get_next()

        dataset = make_pycarbon_dataset(reader) \
          .batch(batch_size=batch_size)

        iterator = dataset.make_initializable_iterator()
        init_op = iterator.initializer

        tensor = iterator.get_next()

        with tf.train.MonitoredTrainingSession() as sess:
            sess.run(init_op)
            sample = sess.run(tensor)
            assert 1 == len(sample.id)

            sess.run(init_op)
            sample = sess.run(tensor)
            assert 2 == len(sample.id)
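
The dynamic batch size above is produced by another Dataset iterator; it can equally be fed at run time through a placeholder captured by an initializable iterator. A sketch assuming the same imports as above (the test name and the batch size of 4 are ours):

def test_feed_batch_size_via_placeholder(carbon_synthetic_dataset):
    with make_carbon_reader(carbon_synthetic_dataset.url, num_epochs=None) as reader:
        batch_size = tf.placeholder(tf.int64, shape=[])
        dataset = make_pycarbon_dataset(reader).batch(batch_size=batch_size)
        iterator = dataset.make_initializable_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            # the placeholder value is captured when the iterator is initialized
            sess.run(iterator.initializer, feed_dict={batch_size: 4})
            assert 4 == len(sess.run(tensor).id)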
def train_and_test(train_dataset_url, test_dataset_url, num_epochs, batch_size,
                   evaluation_interval):
    """
  Train a model for training iterations with a batch size batch_size, printing accuracy every log_interval.
  :param dataset_url: The MNIST dataset url.
  :param num_epochs: The number of epochs to train for.
  :param batch_size: The batch size for training.
  :param evaluation_interval: The interval used to print the accuracy.
  :return:
  """

    with make_batch_carbon_reader(os.path.join(train_dataset_url),
                                  num_epochs=num_epochs) as train_reader:
        with make_batch_carbon_reader(os.path.join(test_dataset_url),
                                      num_epochs=num_epochs) as test_reader:
            # Create the model
            x = tf.placeholder(tf.float32, [None, 784])
            w = tf.Variable(tf.zeros([784, 10]))
            b = tf.Variable(tf.zeros([10]))
            y = tf.matmul(x, w) + b

            # Define loss and optimizer
            y_ = tf.placeholder(tf.int64, [None])

            # Define the loss function
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_,
                                                                   logits=y)

            train_step = tf.train.GradientDescentOptimizer(0.5).minimize(
                cross_entropy)

            correct_prediction = tf.equal(tf.argmax(y, 1), y_)

            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

            train_dataset = make_pycarbon_dataset(train_reader) \
              .apply(tf.data.experimental.unbatch()) \
              .batch(batch_size) \
              .map(decode)

            train_iterator = train_dataset.make_one_shot_iterator()
            label, image = train_iterator.get_next()

            test_dataset = make_pycarbon_dataset(test_reader) \
              .apply(tf.data.experimental.unbatch()) \
              .batch(batch_size) \
              .map(decode)

            test_iterator = test_dataset.make_one_shot_iterator()
            test_label, test_image = test_iterator.get_next()

            # Train
            print(
                'Training model for {0} epoch with batch size {1} and evaluation interval {2}'
                .format(num_epochs, batch_size, evaluation_interval))

            i = 0
            with tf.Session() as sess:
                sess.run([
                    tf.local_variables_initializer(),
                    tf.global_variables_initializer(),
                ])

                try:
                    while True:
                        cur_label, cur_image = sess.run([label, image])

                        sess.run([train_step],
                                 feed_dict={
                                     x: cur_image,
                                     y_: cur_label
                                 })

                        if i % evaluation_interval == 0:
                            test_cur_label, test_cur_image = sess.run(
                                [test_label, test_image])
                            print(
                                'After {0} training iterations, the accuracy of the model is: {1:.2f}'
                                .format(
                                    i,
                                    sess.run(accuracy,
                                             feed_dict={
                                                 x: test_cur_image,
                                                 y_: test_cur_label
                                             })))
                        i += 1

                except tf.errors.OutOfRangeError:
                    print("Finish! the number is " + str(i))