def tensorflow_hello_world(dataset_url='file:///tmp/carbon_pycarbon_dataset/'):
    # Example: tf_tensors will return tensors with dataset data
    with make_carbon_reader(dataset_url) as reader:
        tensor = tf_tensors(reader)
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)

    with make_reader(dataset_url, is_batch=False) as reader:
        tensor = make_tensor(reader)
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)

    # Example: use tf.data.Dataset API
    with make_carbon_reader(dataset_url) as reader:
        dataset = make_pycarbon_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)

    with make_reader(dataset_url, is_batch=False) as reader:
        dataset = make_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)
def tensorflow_hello_world(dataset_url='file:///tmp/carbon_external_dataset'):
    # Example: tf_tensors will return tensors with dataset data
    with make_batch_carbon_reader(dataset_url) as reader:
        tensor = tf_tensors(reader)
        with tf.Session() as sess:
            # Because we are using make_batch_carbon_reader(), each read returns a batch of rows
            # instead of a single row
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

    with make_reader(dataset_url) as reader:
        tensor = make_tensor(reader)
        with tf.Session() as sess:
            # make_reader() reads in batch mode by default, so each read also returns
            # a batch of rows instead of a single row
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

    # Example: use tf.data.Dataset API
    with make_batch_carbon_reader(dataset_url) as reader:
        dataset = make_pycarbon_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

    with make_reader(dataset_url) as reader:
        dataset = make_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))
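Both hello-world variants above assume the pycarbon dataset already exists at the default dataset_url. A minimal driver is sketched below; the assumption that a matching dataset-generation example has been run beforehand is not part of the code above.

if __name__ == '__main__':
    # Minimal sketch: assumes the dataset has already been written to this location
    # by the corresponding dataset-generation example.
    tensorflow_hello_world(dataset_url='file:///tmp/carbon_external_dataset')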
def _read_from_tf_tensors(synthetic_dataset, count, shuffling_queue_capacity, min_after_dequeue, ngram,
                          workers_count=10):
    """Used by several test cases. Reads 'count' rows using the reader.

    The reader is configured without blocklet shuffling and guarantees a deterministic order of rows up to the
    TF shuffling of the results queue, which is controlled by the 'shuffling_queue_capacity' and
    'min_after_dequeue' arguments.

    The function returns a tuple: (actual data read from the dataset, the TF tensors returned by the reader).
    """
    schema_fields = (NON_NULLABLE_FIELDS if ngram is None else ngram)

    with make_carbon_reader(schema_fields=schema_fields,
                            dataset_url=synthetic_dataset.url,
                            reader_pool_type='thread',
                            shuffle_blocklets=False,
                            workers_count=workers_count) as reader:
        row_tensors = tf_tensors(reader, shuffling_queue_capacity=shuffling_queue_capacity,
                                 min_after_dequeue=min_after_dequeue)

        with _tf_session() as sess:
            rows_data = [sess.run(row_tensors) for _ in range(count)]

    return rows_data, row_tensors
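For illustration, a test built on this helper might look like the sketch below. The fixture and assertion helpers (carbon_synthetic_dataset, _assert_all_tensors_have_shape, _assert_expected_rows_data) are the ones used by the neighboring tests, but this particular test body is an assumption, not taken from the source.

def test_read_with_tf_shuffling_queue_sketch(carbon_synthetic_dataset):
    # Hypothetical test: read 30 rows through a small TF shuffling queue and verify that the
    # rows still match the reference data (row order is not guaranteed after shuffling).
    rows_data, row_tensors = _read_from_tf_tensors(
        carbon_synthetic_dataset, count=30,
        shuffling_queue_capacity=20, min_after_dequeue=10, ngram=None)
    _assert_all_tensors_have_shape(row_tensors)
    _assert_expected_rows_data(carbon_synthetic_dataset.data, rows_data)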
def test_simple_read_tensorflow(carbon_synthetic_dataset):
    """Read a couple of rows. Make sure all tensors have static shape sizes assigned and the data matches the
    reference data."""
    with make_carbon_reader(schema_fields=NON_NULLABLE_FIELDS,
                            dataset_url=carbon_synthetic_dataset.url) as reader:
        row_tensors = tf_tensors(reader)
        with _tf_session() as sess:
            rows_data = [sess.run(row_tensors) for _ in range(30)]

    # Make sure we have static shape info for all fields
    _assert_all_tensors_have_shape(row_tensors)
    _assert_expected_rows_data(carbon_synthetic_dataset.data, rows_data)
def test_simple_read_tensorflow_with_non_unischema_many_columns_dataset(carbon_many_columns_non_unischema_dataset):
    """Read a couple of rows. Make sure all tensors have static shape sizes assigned and the data matches the
    reference data."""
    with make_batch_carbon_reader(dataset_url=carbon_many_columns_non_unischema_dataset.url) as reader:
        row_tensors = tf_tensors(reader)

        # Make sure we have static shape info for all fields
        for column in row_tensors:
            assert column.get_shape().as_list() == [None]

        with _tf_session() as sess:
            batch = sess.run(row_tensors)._asdict()
            assert set(batch.keys()) == set(carbon_many_columns_non_unischema_dataset.data[0].keys())
def test_simple_read_tensorflow_with_carbon_dataset(carbon_scalar_dataset):
    """Read a couple of rows. Make sure all tensors have static shape sizes assigned and the data matches the
    reference data."""
    with make_batch_carbon_reader(dataset_url=carbon_scalar_dataset.url) as reader:
        row_tensors = tf_tensors(reader)

        # Make sure we have static shape info for all fields
        for column in row_tensors:
            assert column.get_shape().as_list() == [None]

        with _tf_session() as sess:
            for _ in range(2):
                batch = sess.run(row_tensors)._asdict()
                for i, id_value in enumerate(batch['id']):
                    expected_row = next(d for d in carbon_scalar_dataset.data if d['id'] == id_value)
                    for field_name in expected_row.keys():
                        _assert_fields_eq(batch[field_name][i], expected_row[field_name])
def train_and_test(dataset_url, training_iterations, batch_size, evaluation_interval, start):
    """
    Train a model for training_iterations iterations with a batch size of batch_size, printing accuracy every
    evaluation_interval iterations.

    :param dataset_url: The MNIST dataset url.
    :param training_iterations: The number of training iterations to train for.
    :param batch_size: The batch size for training.
    :param evaluation_interval: The interval used to print the accuracy.
    :param start: The timestamp taken before setup; used to report the time spent before training starts.
    :return: None
    """
    with make_carbon_reader(os.path.join(dataset_url, 'train'), num_epochs=None) as train_reader:
        with make_carbon_reader(os.path.join(dataset_url, 'test'), num_epochs=None) as test_reader:
            train_readout = tf_tensors(train_reader)
            train_image = tf.cast(tf.reshape(train_readout.image, [784]), tf.float32)
            train_label = train_readout.digit
            batch_image, batch_label = tf.train.batch(
                [train_image, train_label], batch_size=batch_size)

            W = tf.Variable(tf.zeros([784, 10]))
            b = tf.Variable(tf.zeros([10]))
            y = tf.matmul(batch_image, W) + b

            # The raw formulation of cross-entropy,
            #
            #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)), reduction_indices=[1]))
            #
            # can be numerically unstable.
            #
            # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
            # outputs of 'y', and then average across the batch.
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=batch_label, logits=y)

            train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

            correct_prediction = tf.equal(tf.argmax(y, 1), batch_label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

            test_readout = tf_tensors(test_reader)
            test_image = tf.cast(tf.reshape(test_readout.image, [784]), tf.float32)
            test_label = test_readout.digit
            test_batch_image, test_batch_label = tf.train.batch(
                [test_image, test_label], batch_size=batch_size)

            end = time.time()
            print("before train time: " + str(end - start))

            # Train
            print('Training model for {0} training iterations with batch size {1} and evaluation interval {2}'
                  .format(training_iterations, batch_size, evaluation_interval))
            with tf.Session() as sess:
                sess.run([
                    tf.local_variables_initializer(),
                    tf.global_variables_initializer(),
                ])
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(sess=sess, coord=coord)
                try:
                    for i in range(training_iterations):
                        if coord.should_stop():
                            break

                        sess.run(train_step)

                        if (i % evaluation_interval) == 0 or i == (training_iterations - 1):
                            feed_batch_image, feed_batch_label = sess.run([test_batch_image, test_batch_label])
                            print('After {0} training iterations, the accuracy of the model is: {1:.2f}'.format(
                                i,
                                sess.run(accuracy, feed_dict={
                                    batch_image: feed_batch_image,
                                    batch_label: feed_batch_label})))
                finally:
                    coord.request_stop()
                    coord.join(threads)
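A command-line entry point for train_and_test might look like the sketch below. The flag names and the default dataset URL are assumptions for illustration only; the function above only requires that dataset_url contain 'train' and 'test' subdirectories.

import argparse
import time


def main():
    # Hypothetical CLI wrapper around train_and_test(); flag names and defaults are illustrative.
    parser = argparse.ArgumentParser(description='Train a simple MNIST model from a pycarbon dataset')
    parser.add_argument('--dataset-url', type=str, default='file:///tmp/mnist',
                        help='URL of the MNIST dataset (must contain train/ and test/ subdirectories)')
    parser.add_argument('--training-iterations', type=int, default=100)
    parser.add_argument('--batch-size', type=int, default=100)
    parser.add_argument('--evaluation-interval', type=int, default=10)
    args = parser.parse_args()

    start = time.time()
    train_and_test(args.dataset_url, args.training_iterations, args.batch_size,
                   args.evaluation_interval, start)


if __name__ == '__main__':
    main()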
def test_shuffling_queue_with_make_batch_carbon_reader(carbon_scalar_dataset):
    with make_batch_carbon_reader(dataset_url=carbon_scalar_dataset.url) as reader:
        # Passing shuffling queue arguments (capacity=100, min_after_dequeue=90) together with
        # a batch reader is expected to raise ValueError.
        with pytest.raises(ValueError):
            tf_tensors(reader, 100, 90)