def testFromCSVWithFeatureSpec(self):
    """Compare a feature-spec CSV read against the Pandas read of the file.

    The sparse test CSV is loaded once with ``pd.read_csv`` and once with
    ``TensorFlowDataFrame.from_csv_with_feature_spec``; the sparse columns
    are densified and both frames are handed to
    ``self._assert_pandas_equals_tensorflow``. When ``HAS_PANDAS`` is false
    the method returns without asserting anything.
    """
    # NOTE(review): a second method with this same name appears later in the
    # visible source. If both live in the same class, the later definition
    # replaces this one and this test never runs -- confirm and rename one.
    if not HAS_PANDAS:
        return
    num_batches = 100
    batch_size = 8

    data_path = _make_test_csv_sparse()
    feature_spec = {
        "int": tf.FixedLenFeature(None, dtypes.int16, np.nan),
        "float": tf.VarLenFeature(dtypes.float16),
        "bool": tf.VarLenFeature(dtypes.bool),
        "string": tf.FixedLenFeature(None, dtypes.string, ""),
    }

    pandas_df = pd.read_csv(data_path, dtype={"string": object})
    # Pandas uses NaN for empty cells in a string column. replace() cannot
    # match them because nan != nan, but fillna() targets missing values
    # directly. The result is assigned back onto the frame instead of
    # mutating a Series pulled out of it, since such a write is not
    # guaranteed to propagate to the DataFrame.
    pandas_df["string"] = pandas_df["string"].fillna("")

    tensorflow_df = df.TensorFlowDataFrame.from_csv_with_feature_spec(
        [data_path],
        batch_size=batch_size,
        shuffle=False,
        feature_spec=feature_spec)

    # These columns were sparse; re-densify them for comparison
    tensorflow_df["float"] = densify.Densify(np.nan)(tensorflow_df["float"])
    tensorflow_df["bool"] = densify.Densify(np.nan)(tensorflow_df["bool"])

    self._assert_pandas_equals_tensorflow(pandas_df,
                                          tensorflow_df,
                                          num_batches=num_batches,
                                          batch_size=batch_size)
def testFromCSVWithFeatureSpec(self):
    """Check the feature-spec CSV reader against Pandas on the sparse CSV.

    Loads the same file with ``pd.read_csv`` and with
    ``TensorFlowDataFrame.from_csv_with_feature_spec``, densifies the two
    sparse columns, and delegates the comparison to
    ``self._assert_pandas_equals_tensorflow``. Does nothing when
    ``HAS_PANDAS`` is false.
    """
    if not HAS_PANDAS:
        return

    batch_count = 100
    rows_per_batch = 8
    csv_path = _make_test_csv_sparse()

    spec = {
        "int": tf.FixedLenFeature(None, dtypes.int16, np.nan),
        "float": tf.VarLenFeature(dtypes.float16),
        "bool": tf.VarLenFeature(dtypes.bool),
    }

    expected_df = pd.read_csv(csv_path)
    actual_df = df.TensorFlowDataFrame.from_csv_with_feature_spec(
        [csv_path],
        batch_size=rows_per_batch,
        shuffle=False,
        feature_spec=spec)

    # Both columns were parsed as sparse; turn them back into dense series
    # (NaN as the fill value) so they can be compared with Pandas.
    for sparse_column in ("float", "bool"):
        to_dense = densify.Densify(np.nan)
        actual_df[sparse_column] = to_dense(actual_df[sparse_column])

    self._assert_pandas_equals_tensorflow(
        expected_df,
        actual_df,
        num_batches=batch_count,
        batch_size=rows_per_batch)
def _test_sparsify_densify(self, x, default_value):
    """Round-trip ``x`` through Sparsify and Densify and verify both stages.

    Args:
        x: NumPy array fed through ``in_memory_source.NumpySource`` as a
            single batch of ``len(x)`` rows.
        default_value: value passed to both ``Sparsify`` and ``Densify``;
            entries equal to it (or NaN entries, when it is NaN and ``x``
            is not a string array) are expected to be absent from the
            sparse result.
    """
    source_columns = in_memory_source.NumpySource(x, batch_size=len(x))()
    to_sparse = sparsify.Sparsify(default_value)
    to_dense = densify.Densify(default_value)
    (sparse_series,) = to_sparse(source_columns[1])
    (dense_series,) = to_dense(sparse_series)

    # One cache is shared by both builds.
    build_cache = {}
    sparse_tensor = sparse_series.build(build_cache)
    dense_tensor = dense_series.build(build_cache)

    with self.test_session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        sparse_val, dense_val = sess.run([sparse_tensor, dense_tensor])
        coord.request_stop()
        coord.join(threads)

    is_text = x.dtype.kind in ["S", "U"]

    # Mask of the entries expected to survive sparsification. NaN never
    # compares equal to itself, so a NaN default needs isnan() instead of !=.
    if not is_text and np.isnan(default_value):
        kept = ~np.isnan(x)
    else:
        kept = x != default_value
    x_values = x[kept]
    x_indexes = np.arange(len(x))[kept].reshape(-1, 1)

    expected_x = x
    expected_x_values = x_values
    if is_text:
        # Python 2/3 compatibility
        # TensorFlow always returns bytes, so we just convert the unicode
        # expectations to bytes also before comparing.
        expected_x = [item.encode("utf-8") for item in x]
        expected_x_values = [item.encode("utf-8") for item in x_values]

    np.testing.assert_array_equal(len(x), sparse_val.shape[0])
    np.testing.assert_array_equal(expected_x_values, sparse_val.values)
    np.testing.assert_array_equal(x_indexes, sparse_val.indices)
    np.testing.assert_array_equal(expected_x, dense_val)