def _transform(self, dataset):
    """
    Apply the optimized inference graph to `dataset` with TensorFrames.

    The graph definition from `self._optimize_for_inference()` is imported into a
    fresh `tf.Graph`; the input mapping is turned into a feed dict, the output
    mapping determines the fetched tensors, and the resulting columns are renamed
    to the names requested in the output mapping.

    :param dataset: dataframe to transform; it is analyzed with `tfs.analyze` first
    :return: dataframe returned by `tfs.map_blocks`, with output columns renamed
    """
    frozen_graph_def = self._optimize_for_inference()
    in_pairs = self.getInputMapping()
    out_pairs = self.getOutputMapping()

    graph = tf.Graph()
    with tf.Session(graph=graph):
        analyzed = tfs.analyze(dataset)

        # Op names of every requested output tensor
        fetch_op_names = []
        for tnsr_name, _ in out_pairs:
            fetch_op_names.append(tfx.op_name(tnsr_name))

        # Load graph
        tf.import_graph_def(graph_def=frozen_graph_def,
                            name='',
                            return_elements=fetch_op_names)

        # Feed dict maps from placeholder name to DF column name
        placeholder_to_column = {}
        for col_name, tnsr_name in in_pairs:
            placeholder_to_column[self._getSparkDlOpName(tnsr_name)] = col_name

        fetch_tensors = []
        for op_name in fetch_op_names:
            fetch_tensors.append(tfx.get_tensor(op_name, graph))

        result = tfs.map_blocks(fetch_tensors, analyzed, feed_dict=placeholder_to_column)

        # We still have to rename output columns
        for tnsr_name, wanted_name in out_pairs:
            produced_name = tfx.op_name(tnsr_name, graph)
            if produced_name == wanted_name:
                continue
            result = result.withColumnRenamed(produced_name, wanted_name)

    return result
def residual_of_closest(df, feature_column, residual_column, centers, assigned_column='assigned'):
    """
    Compute, for every point, the residual to its closest center.

    :param df: dataframe, contains all points
    :param feature_column: string, name of the points column within df
    :param residual_column: string, output column name for the residual between
        a point and its closest center
    :param centers: numpy.array, [num_centroids, num_features], the k cluster centers
    :param assigned_column: string, output column name for the index of the closest
        center. NOTE(review): this argument is not referenced in the function body;
        the column name presumably comes from `_assign_center` -- confirm.
    :return: dataframe with two extra columns: the residual and the assigned index
    """
    analyzed = tfs.analyze(df)
    dims = centers.shape[1]
    with tf.Graph().as_default():
        # The placeholder is named after the feature column it is fed from.
        pts = tf.placeholder(tf.double, shape=[None, dims], name=feature_column)
        closest = _assign_center(pts, centers)
        fetches = [
            closest,
            _residual_of_assigned(pts, closest, centers, residual_column),
        ]
        result = tfs.map_blocks(fetches, analyzed)
    return result
def _transform(self, dataset):
    """
    Apply the optimized inference graph to `dataset` with TensorFrames.

    A warning is logged when the incoming schema contains at least one DoubleType
    column. The graph from `self._optimize_for_inference()` is then imported into a
    fresh `tf.Graph`, fed according to the input mapping, and the fetched columns
    are renamed according to the output mapping.

    :param dataset: dataframe to transform; it is analyzed with `tfs.analyze` first
    :return: dataframe returned by `tfs.map_blocks`, with output columns renamed
    """
    has_double_column = any(field.dataType == DoubleType() for field in dataset.schema)
    if has_double_column:
        logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                       "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                       "fed to input tensors of type tf.float64. To feed dataframe data to "
                       "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                       "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

    frozen_graph_def = self._optimize_for_inference()
    in_pairs = self.getInputMapping()
    out_pairs = self.getOutputMapping()

    graph = tf.Graph()
    with tf.Session(graph=graph):
        analyzed = tfs.analyze(dataset)

        fetch_op_names = []
        for tnsr_name, _ in out_pairs:
            fetch_op_names.append(tfx.op_name(tnsr_name))

        # Load graph
        tf.import_graph_def(graph_def=frozen_graph_def,
                            name='',
                            return_elements=fetch_op_names)

        # Feed dict maps from placeholder name to DF column name
        placeholder_to_column = {}
        for col_name, tnsr_name in in_pairs:
            placeholder_to_column[tfx.op_name(tnsr_name)] = col_name

        fetch_tensors = []
        for op_name in fetch_op_names:
            fetch_tensors.append(tfx.get_tensor(op_name, graph))

        result = tfs.map_blocks(fetch_tensors, analyzed, feed_dict=placeholder_to_column)

        # We still have to rename output columns
        for tnsr_name, wanted_name in out_pairs:
            produced_name = tfx.op_name(tnsr_name, graph)
            if produced_name == wanted_name:
                continue
            result = result.withColumnRenamed(produced_name, wanted_name)

    return result
def simple_example_2():
    """
    TensorFrames example: elementwise sum and minimum over a column of vectors.

    Builds a ten-row dataframe whose `y` column holds two-element vectors, analyzes
    it, duplicates the column as `z`, reduces both columns with
    `tfs.reduce_blocks` and prints the two results.
    """
    session = SparkSession.builder.appName('simple-tensorframes-example-2').getOrCreate()
    session.sparkContext.setLogLevel('WARN')

    rows = []
    for y in range(10):
        rows.append(Row(y=[float(y), float(-y)]))
    frame = session.createDataFrame(rows)
    frame.show()
    tfs.print_schema(frame)

    # Analyze first to find the dimensions of the vectors.
    analyzed = tfs.analyze(frame)
    tfs.print_schema(analyzed)

    # Make a copy of the 'y' column: An inexpensive operation in Spark 2.0+.
    duplicated = analyzed.select(analyzed.y, analyzed.y.alias('z'))

    # Execute the tensor graph.
    with tf.Graph().as_default():
        y_block = tfs.block(duplicated, 'y', tf_name='y_input')
        z_block = tfs.block(duplicated, 'z', tf_name='z_input')
        # Perform elementwise sum and minimum.
        summed = tf.reduce_sum(y_block, [0], name='y')
        smallest = tf.reduce_min(z_block, [0], name='z')
        data_sum, data_min = tfs.reduce_blocks([summed, smallest], duplicated)

    print('Elementwise sum: %s and minimum: %s' % (data_sum, data_min))
def _transform(self, dataset):
    """
    Apply the optimized inference graph to `dataset` with TensorFrames.

    A warning is logged when the incoming schema contains at least one DoubleType
    column. The graph from `self._optimize_for_inference()` is then imported into a
    fresh `tf.Graph`, fed according to the input mapping, and the fetched columns
    are renamed according to the output mapping.

    :param dataset: dataframe to transform; it is analyzed with `tfs.analyze` first
    :return: dataframe returned by `tfs.map_blocks`, with output columns renamed
    """
    # `any` over a generator stops at the first DoubleType column instead of
    # building a throwaway list just to measure its length.
    if any(field.dataType == DoubleType() for field in dataset.schema):
        # `Logger.warn` is a deprecated alias; `Logger.warning` is the supported API.
        logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                       "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                       "fed to input tensors of type tf.float64. To feed dataframe data to "
                       "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                       "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

    graph_def = self._optimize_for_inference()
    input_mapping = self.getInputMapping()
    output_mapping = self.getOutputMapping()

    graph = tf.Graph()
    with tf.Session(graph=graph):
        analyzed_df = tfs.analyze(dataset)

        out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
        # Load graph
        tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)

        # Feed dict maps from placeholder name to DF column name
        feed_dict = {tfx.op_name(tnsr_name): col_name for col_name, tnsr_name in input_mapping}
        fetches = [tfx.get_tensor(tnsr_name, graph) for tnsr_name in out_tnsr_op_names]

        out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)

        # We still have to rename output columns
        for tnsr_name, new_colname in output_mapping:
            old_colname = tfx.op_name(tnsr_name, graph)
            if old_colname != new_colname:
                out_df = out_df.withColumnRenamed(old_colname, new_colname)

    return out_df
def tf_serving_with_dataframe(df, model_base_path, model_version=None):
    """
    Run a TensorFlow saved model over a Spark dataframe with TensorFrames.

    The dataframe columns are renamed to match the model's feed tensors, the
    fetch tensors are evaluated with `tfs.map_blocks`, and the renames are then
    reverted for both the feed and the fetch mappings.

    :param df: spark dataframe, batch input for the model
    :param model_base_path: str, tensorflow saved Model model base path
    :param model_version: int, tensorflow saved Model model version, default None
    :return: spark dataframe, with predicted result.
    """
    import tensorframes as tfs

    g, feed_tensors, fetch_tensors = load_model(model_base_path, model_version)
    with g.as_default():
        df = rename_by_mapping(df, feed_tensors)
        df = tfs.analyze(df)
        # On Python 3 `dict.values()` is a view object, not a list; hand
        # TensorFrames a concrete list so the fetches are treated as a sequence.
        df = tfs.map_blocks(list(fetch_tensors.values()), df)
        df = rename_by_mapping(df, feed_tensors, reverse=True)
        return rename_by_mapping(df, fetch_tensors, reverse=True)
def _check_transformer_output(transformer, dataset, expected):
    """
    Assert that `transformer` applied to `dataset` yields values close to `expected`.

    The dataset is analyzed, transformed, and the output columns listed in
    `_output_mapping` are collected, flattened row by row and stacked into a single
    array that is compared against `expected` within `_all_close_tolerance`.
    """
    transformed = transformer.transform(tfs.analyze(dataset))

    # Collect transformed values
    columns = list(_output_mapping.values())
    flattened_rows = [
        np.ravel([row[name] for name in columns])
        for row in transformed.select(columns).collect()
    ]
    actual = np.hstack(flattened_rows)

    largest_gap = np.max(np.abs(expected - actual))
    failure_text = 'not close => shape {} != {}, max_diff {} > {}'.format(
        expected.shape, actual.shape, largest_gap, _all_close_tolerance)
    assert np.allclose(expected, actual, atol=_all_close_tolerance), failure_text
# Script excerpt: sets up random points and initial centers, then starts building
# a TensorFlow graph of the terms used to compute point-to-center distances.
# NOTE(review): `sc` and `sqlContext` are not defined in this excerpt; presumably
# they are provided by the PySpark shell -- confirm.
import tensorflow as tf
import tensorframes as tfs
from pyspark.mllib.random import RandomRDDs
import numpy as np

num_features = 4
k = 2
# TODO: does not work with 1
# 100 normally distributed vectors, each wrapped in a one-element list so that it
# becomes a single array column of the dataframe.
data = RandomRDDs.normalVectorRDD(sc, numCols=num_features, numRows=100, seed=1).map(lambda v: [v.tolist()])
df = sqlContext.createDataFrame(data).toDF("features")
# For now, analysis is still required.
df0 = tfs.analyze(df)

# Random starting centers, one row per cluster.
init_centers = np.random.randn(k, num_features)

# For debugging
block = np.array(data.take(10))[::, 0, ::]

# Find the distances first
with tf.Graph().as_default() as g:
    points = tf.placeholder(tf.double, shape=[None, num_features], name='points')
    num_points = tf.shape(points)[0]
    #centers = tf.placeholder(tf.double, shape=[k, num_features], name='centers')
    centers = tf.constant(init_centers)
    # Sum of squared coordinates of every point.
    squares = tf.reduce_sum(tf.square(points), reduction_indices=1)
# Script excerpt: sets up random points and initial centers, then starts building
# a TensorFlow graph of the terms used to compute point-to-center distances.
# NOTE(review): `tf`, `sc` and `sqlContext` are used but not defined in this
# excerpt; presumably they come from an earlier import / the PySpark shell -- confirm.
import tensorframes as tfs
from pyspark.mllib.random import RandomRDDs
import numpy as np

num_features = 4
k = 2
# TODO: does not work with 1
# 100 normally distributed vectors, each wrapped in a one-element list so that it
# becomes a single array column of the dataframe.
data = RandomRDDs.normalVectorRDD(
    sc,
    numCols=num_features,
    numRows=100,
    seed=1).map(lambda v: [v.tolist()])
df = sqlContext.createDataFrame(data).toDF("features")
# For now, analysis is still required.
df0 = tfs.analyze(df)

# Random starting centers, one row per cluster.
init_centers = np.random.randn(k, num_features)

# For debugging
block = np.array(data.take(10))[::,0,::]

# Find the distances first
with tf.Graph().as_default() as g:
    points = tf.placeholder(tf.double, shape=[None, num_features], name='points')
    num_points = tf.shape(points)[0]
    #centers = tf.placeholder(tf.double, shape=[k, num_features], name='centers')
    centers = tf.constant(init_centers)
    # Sum of squared coordinates of every point and of every center.
    squares = tf.reduce_sum(tf.square(points), reduction_indices=1)
    center_squares = tf.reduce_sum(tf.square(centers), reduction_indices=1)
    # Dot product of every point with every center.
    prods = tf.matmul(points, centers, transpose_b = True)
# Script excerpt: computes the elementwise sum and minimum of a column of vectors
# with `tfs.reduce_blocks`.
# NOTE(review): `sqlContext` is not defined in this excerpt; presumably it is
# provided by the PySpark shell -- confirm.
import tensorflow as tf
import tensorframes as tfs
from pyspark.sql import Row

# Build a DataFrame of vectors
data = [Row(y=[float(y), float(-y)]) for y in range(10)]
df = sqlContext.createDataFrame(data)
# Because the dataframe contains vectors, we need to analyze it first to find the
# dimensions of the vectors.
df2 = tfs.analyze(df)

# The information gathered by TF can be printed to check the content:
tfs.print_schema(df2)
# TF has inferred that y contains vectors of size 2
# root
#  |-- y: array (nullable = false) DoubleType[?,2]

# Let's use the analyzed dataframe to compute the sum and the elementwise minimum
# of all the vectors:
# First, let's make a copy of the 'y' column. This will be very cheap in Spark 2.0+
df3 = df2.select(df2.y, df2.y.alias("z"))
with tf.Graph().as_default() as g:
    # The placeholders. Note the special name that ends with '_input':
    y_input = tfs.block(df3, 'y', tf_name="y_input")
    z_input = tfs.block(df3, 'z', tf_name="z_input")
    # Reduce along axis 0; the outputs are named after the columns they reduce.
    y = tf.reduce_sum(y_input, [0], name='y')
    z = tf.reduce_min(z_input, [0], name='z')
    # The resulting dataframe
    (data_sum, data_min) = tfs.reduce_blocks([y, z], df3)

# The final results are numpy arrays:
# Script excerpt: maps a column of vectors to their elementwise inverses plus a
# column of ones, as a first step towards a per-key mean.
# NOTE(review): `logging` and `sqlContext` are used but not defined in this
# excerpt; presumably an `import logging` precedes it and `sqlContext` comes from
# the PySpark shell -- confirm.
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

import tensorframes as tfs
import tensorflow as tf
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType, IntegerType, LongType, FloatType
from tensorframes.core import _java_api
japi = _java_api()
_java_api().initialize_logging()

# The input data
data = [Row(x=[float(x), float(2 * x)], key=str(x % 2)) for x in range(1, 6)]
# NOTE(review): this first dataframe is immediately replaced by the analyzed one
# built on the next line.
df = sqlContext.createDataFrame(data)
df = tfs.analyze(sqlContext.createDataFrame(data))

# The geometric mean:
# TODO(tjh) make a test out of this, it found some bugs
# - non numeric columns (string)
# - unused columns
# - output that has a child
# NOTE(review): the graph below computes inverses (`tf.inv`), which looks like the
# building block of a harmonic rather than a geometric mean -- confirm the wording.
col_name = "x"
col_key = "key"
with tf.Graph().as_default() as g:
    x = tfs.block(df, col_name)
    invs = tf.inv(tf.to_double(x), name="invs")
    # Two output columns: the inverses and a same-shaped column of ones named "count".
    df2 = tfs.map_blocks([invs, tf.ones_like(invs, name="count")], df)

# The geometric mean
# Script excerpt: maps a column of vectors to their elementwise inverses plus a
# column of ones, then starts a per-key aggregation over the result.
# NOTE(review): `sqlContext` is not defined in this excerpt; presumably it is
# provided by the PySpark shell -- confirm.
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

import tensorframes as tfs
import tensorflow as tf
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType, IntegerType, LongType, FloatType
from tensorframes.core import _java_api
japi = _java_api()
_java_api().initialize_logging()

# The input data
data = [Row(x=[float(x), float(2 * x)], key=str(x % 2)) for x in range(1, 6)]
# NOTE(review): this first dataframe is immediately replaced by the analyzed one
# built on the next line.
df = sqlContext.createDataFrame(data)
df = tfs.analyze(sqlContext.createDataFrame(data))

# The geometric mean:
# TODO(tjh) make a test out of this, it found some bugs
# - non numeric columns (string)
# - unused columns
# - output that has a child
# NOTE(review): the graph below computes inverses (`tf.inv`), which looks like the
# building block of a harmonic rather than a geometric mean -- confirm the wording.
col_name = "x"
col_key = "key"
with tf.Graph().as_default() as g:
    x = tfs.block(df, col_name)
    invs = tf.inv(tf.to_double(x), name="invs")
    # Two output columns: the inverses and a same-shaped column of ones named "count".
    df2 = tfs.map_blocks([invs, tf.ones_like(invs, name="count")], df)

# The geometric mean
# Group the mapped columns by the key column.
gb = df2.select(col_key, "invs", "count").groupBy("key")
def m_kmeans(df, feature_column, num_centroids_each, num_features, m_groups, max_iter=10):
    """
    M K-means algorithm applied on a dataframe of points

    The feature dimensions are split into `m_groups` contiguous groups and the
    centers are re-estimated until they stop moving or `max_iter` is reached.

    :param df: dataframe, contains all points
    :param feature_column: string, the points column within df
    :param num_centroids_each: int, k clusters
    :param num_features: int, dimension of a point vector
    :param m_groups: int, number of groups a point is split into
    :param max_iter: int, maximum number of iterations
    :return: numpy.array: [num_centroids, num_features], the k cluster centers with
        m groups concatenated. When `max_iter` is not positive, the initial centers
        (the first `num_centroids_each` points) are returned unchanged.
    """
    initial_centers = df.select(feature_column).take(num_centroids_each)
    centers = np.array(initial_centers).reshape(num_centroids_each, num_features)
    # Built as a list (not a lazy `map`) because the slices are consumed again on
    # every iteration; `np.arange` replaces the Python 2-only `xrange`.
    m_slice = [slice(min(r), max(r) + 1)
               for r in np.array_split(np.arange(num_features), m_groups)]
    slices = np.array_split(np.arange(m_groups * num_centroids_each), m_groups)
    df = tfs.analyze(df)

    # Ensures a defined return value even when the loop body never runs.
    new_centers = centers
    for _ in range(max_iter):
        # Per-block pass: counts and vector sums for every centroid of every group.
        with tf.Graph().as_default():
            points = tf.placeholder(tf.double, shape=[None, num_features], name=feature_column)
            counts, vector_sums = calculate_new_centers_for_m_slice(
                m_slice, points, tf.nn.l2_normalize(centers, dim=1), num_centroids_each)
            counts = tf.identity(counts, name='counts')
            vector_sums = tf.identity(vector_sums, name='vector_sums')
            df2 = tfs.map_blocks([counts, vector_sums], df, trim=True)

        # Reduction pass: add the per-block results together.
        with tf.Graph().as_default():
            counts = tf.placeholder(
                tf.int64,
                shape=[None, num_centroids_each * m_groups],
                name='counts_input')
            # Floor division keeps the placeholder dimension an integer on Python 3.
            vector_sums = tf.placeholder(
                tf.double,
                shape=[None, num_centroids_each * m_groups, num_features // m_groups],
                name='vector_sums_input')
            count = tf.reduce_sum(counts, axis=0, name='counts')
            vector_sum = tf.reduce_sum(vector_sums, axis=0, name='vector_sums')
            d_count, d_vector_sum = tfs.reduce_blocks([count, vector_sum], df2)

        # The small epsilon avoids dividing by zero for centroids with no points.
        new_centers = d_vector_sum / (d_count[:, np.newaxis] + 1e-7)
        # Concatenate the per-group centers back along the feature axis.
        new_centers = np.concatenate([new_centers[i] for i in slices], axis=1)
        if np.allclose(centers, new_centers):
            break
        centers = new_centers
    return new_centers
# The number of clusters k = 10 num_points = 100000 num_iters = 10 FEATURES_COL = "features" np.random.seed(2) np_data = [x.tolist() for x in np.random.uniform(0.0, 1.0, size=(num_points, num_features))] schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)]) mllib_rows = [Row(_convert_to_vector(x)) for x in np_data] mllib_df = sqlContext.createDataFrame(mllib_rows, schema).coalesce(1).cache() df = sqlContext.createDataFrame([[r] for r in np_data]).toDF(FEATURES_COL).coalesce(1) # For now, analysis is still required. We cache the output because we are going to perform # multiple runs on the dataset. df0 = tfs.analyze(df).cache() mllib_df.count() df0.count() np.random.seed(2) init_centers = np.random.randn(k, num_features) start_centers = init_centers dataframe = df0 ta_0 = time.time() kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(FEATURES_COL).setInitMode( "random").setMaxIter(num_iters) mod = kmeans.fit(mllib_df)
# The number of clusters k = 10 num_points = 100000 num_iters = 10 FEATURES_COL = "features" np.random.seed(2) np_data = [x.tolist() for x in np.random.uniform(0.0, 1.0, size=(num_points, num_features))] schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)]) mllib_rows = [Row(_convert_to_vector(x)) for x in np_data] mllib_df = sqlContext.createDataFrame(mllib_rows, schema).coalesce(1).cache() df = sqlContext.createDataFrame([[r] for r in np_data]).toDF(FEATURES_COL).coalesce(1) # For now, analysis is still required. We cache the output because we are going to perform # multiple runs on the dataset. df0 = tfs.analyze(df).cache() mllib_df.count() df0.count() np.random.seed(2) init_centers = np.random.randn(k, num_features) start_centers = init_centers dataframe = df0 ta_0 = time.time() kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(FEATURES_COL).setInitMode( "random").setMaxIter(num_iters) mod = kmeans.fit(mllib_df) ta_1 = time.time()