def _transform(self, dataset):
    """
    Run the stored TensorFlow graph over ``dataset`` using ``tfs.map_blocks``.

    The graph definition returned by ``self._optimize_for_inference()`` is imported
    into a fresh ``tf.Graph``; DataFrame columns are fed to placeholders following the
    input mapping, and the fetched columns are renamed following the output mapping.

    :param dataset: DataFrame to transform
    :return: the DataFrame returned by ``tfs.map_blocks``, with its output columns
             renamed to the names requested in the output mapping
    """
    graph_def = self._optimize_for_inference()
    in_pairs = self.getInputMapping()
    out_pairs = self.getOutputMapping()

    target_graph = tf.Graph()
    with tf.Session(graph=target_graph):
        analyzed = tfs.analyze(dataset)

        output_op_names = []
        for tensor_name, _ in out_pairs:
            output_op_names.append(tfx.op_name(tensor_name))

        # Import the graph definition without adding a name prefix.
        tf.import_graph_def(graph_def=graph_def,
                            name='',
                            return_elements=output_op_names)

        # Placeholder op name -> name of the DataFrame column that feeds it.
        placeholder_to_column = dict(
            (self._getSparkDlOpName(tensor_name), column)
            for column, tensor_name in in_pairs)
        output_tensors = [tfx.get_tensor(op_name, target_graph)
                          for op_name in output_op_names]

        result = tfs.map_blocks(output_tensors, analyzed,
                                feed_dict=placeholder_to_column)

        # The fetched columns still carry op names; give them the requested names.
        for tensor_name, wanted_name in out_pairs:
            produced_name = tfx.op_name(tensor_name, target_graph)
            if produced_name == wanted_name:
                continue
            result = result.withColumnRenamed(produced_name, wanted_name)

    return result
def assign_center(df, feature_column, residual_column, assigned_coarse_column,
                  assigned_pq_column, coarse_centers, pq_centers, m):
    """
    Assign every point to its coarse index and its pq indexes.

    The residuals against the closest coarse center are computed first (through
    `residual_of_closest`); the points are then assigned against `coarse_centers`
    and the residuals against `pq_centers`.

    :param df: dataframe, contains all points
    :param feature_column: string, the points column within df
    :param residual_column: string, the column in which the residuals are stored
    :param assigned_coarse_column: string, the output column name for the coarse index
    :param assigned_pq_column: string, the output column name for the pq indexes
    :param coarse_centers: numpy.array, [num_centroids, num_features], the coarse
        cluster centers
    :param pq_centers: numpy.array, [num_centroids, num_features], the pq cluster centers
    :param m: int, number of groups a point is split into for pq
    :return: dataframe with the extra columns `assigned_coarse_column` and
        `assigned_pq_column`
    """
    with_residuals = residual_of_closest(df, feature_column, residual_column,
                                         coarse_centers)
    block_shape = [None, coarse_centers.shape[1]]

    with tf.Graph().as_default():
        # One placeholder per input column; the placeholder names are the column names.
        point_block = tf.placeholder(tf.double, shape=block_shape, name=feature_column)
        residual_block = tf.placeholder(tf.double, shape=block_shape, name=residual_column)

        coarse_assignment = _assign_center(point_block, coarse_centers,
                                           assigned_coarse_column)
        pq_assignment = _assign_center(residual_block, pq_centers,
                                       assigned_pq_column, m)

        return tfs.map_blocks([coarse_assignment, pq_assignment], with_residuals)
def _transform(self, dataset):
    """
    Run the stored TensorFlow graph over ``dataset`` using ``tfs.map_blocks``.

    A warning is logged when ``dataset`` contains at least one DoubleType column.
    The graph definition returned by ``self._optimize_for_inference()`` is then
    imported into a fresh ``tf.Graph``, DataFrame columns are fed to placeholders
    following the input mapping, and the fetched columns are renamed following the
    output mapping.

    :param dataset: DataFrame to transform
    :return: the DataFrame returned by ``tfs.map_blocks``, with its output columns
             renamed to the names requested in the output mapping
    """
    # A generator lets any() stop at the first DoubleType column.
    if any(field.dataType == DoubleType() for field in dataset.schema):
        # Logger.warn is a deprecated alias; Logger.warning is the supported method.
        logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                       "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                       "fed to input tensors of type tf.float64. To feed dataframe data to "
                       "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                       "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

    graph_def = self._optimize_for_inference()
    input_mapping = self.getInputMapping()
    output_mapping = self.getOutputMapping()

    graph = tf.Graph()
    with tf.Session(graph=graph):
        analyzed_df = tfs.analyze(dataset)
        out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
        # Load graph
        tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)

        # Feed dict maps from placeholder name to DF column name
        feed_dict = {tfx.op_name(tnsr_name): col_name for col_name, tnsr_name in input_mapping}
        fetches = [tfx.get_tensor(tnsr_name, graph) for tnsr_name in out_tnsr_op_names]
        out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)

        # We still have to rename output columns
        for tnsr_name, new_colname in output_mapping:
            old_colname = tfx.op_name(tnsr_name, graph)
            if old_colname != new_colname:
                out_df = out_df.withColumnRenamed(old_colname, new_colname)

    return out_df
def _transform(self, dataset):
    """
    Run the stored TensorFlow graph over ``dataset`` using ``tfs.map_blocks``.

    Logs a warning if any column of ``dataset`` is a DoubleType column, imports the
    graph definition returned by ``self._optimize_for_inference()`` into a fresh
    ``tf.Graph``, feeds DataFrame columns according to the input mapping and renames
    the fetched columns according to the output mapping.

    :param dataset: DataFrame to transform
    :return: the DataFrame returned by ``tfs.map_blocks``, with its output columns
             renamed to the names requested in the output mapping
    """
    double_fields = [field for field in dataset.schema
                     if field.dataType == DoubleType()]
    if double_fields:
        logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                       "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                       "fed to input tensors of type tf.float64. To feed dataframe data to "
                       "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                       "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

    graph_def = self._optimize_for_inference()
    in_pairs = self.getInputMapping()
    out_pairs = self.getOutputMapping()

    target_graph = tf.Graph()
    with tf.Session(graph=target_graph):
        analyzed = tfs.analyze(dataset)

        output_op_names = []
        for tensor_name, _ in out_pairs:
            output_op_names.append(tfx.op_name(tensor_name))

        # Import the graph definition without adding a name prefix.
        tf.import_graph_def(graph_def=graph_def,
                            name='',
                            return_elements=output_op_names)

        # Placeholder op name -> name of the DataFrame column that feeds it.
        placeholder_to_column = dict(
            (tfx.op_name(tensor_name), column) for column, tensor_name in in_pairs)
        output_tensors = [tfx.get_tensor(op_name, target_graph)
                          for op_name in output_op_names]

        result = tfs.map_blocks(output_tensors, analyzed,
                                feed_dict=placeholder_to_column)

        # The fetched columns still carry op names; give them the requested names.
        for tensor_name, wanted_name in out_pairs:
            produced_name = tfx.op_name(tensor_name, target_graph)
            if produced_name == wanted_name:
                continue
            result = result.withColumnRenamed(produced_name, wanted_name)

    return result
def residual_of_closest(df, feature_column, residual_column, centers,
                        assigned_column='assigned'):
    """
    Compute the residual between each point and its closest center.

    :param df: dataframe, contains all points
    :param feature_column: string, the points column within df
    :param residual_column: string, the output column name for the residual error
        against the closest center
    :param centers: numpy.array, [num_centroids, num_features], the k cluster centers
    :param assigned_column: string, the output column name for the index of the
        closest center.
        NOTE(review): this argument is not used in the body; the assignment tensor
        is created by `_assign_center(points, centers)` without it - confirm intent.
    :return: dataframe extended with the assignment and residual columns
    """
    analyzed = tfs.analyze(df)
    block_shape = [None, centers.shape[1]]

    with tf.Graph().as_default():
        # The placeholder is named after the feature column it reads.
        point_block = tf.placeholder(tf.double, shape=block_shape, name=feature_column)
        closest = _assign_center(point_block, centers)
        leftover = _residual_of_assigned(point_block, closest, centers, residual_column)
        return tfs.map_blocks([closest, leftover], analyzed)
def run_one_step(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    For each point the closest centroid is computed in TensorFlow; the points are
    then grouped by centroid index and summed, and the new centroids are obtained by
    dividing the per-cluster feature sums by the per-cluster counts. Spark is used to
    distribute the aggregation.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the
        number of features
    :return: the matrix of new centroids (one row per collected group, in the order
        the groups are collected) and the summed distance of points to centroids
    """
    # Only the feature dimension is used below; the unpacking still requires a
    # two-dimensional input.
    _, num_features = np.shape(start_centers)

    # Per-point outputs: index of the nearest centroid ('indexes'), distance to it
    # ('min_distances') and a column of 1's ('count') that is summed later to count
    # the members of each cluster.
    with tf.Graph().as_default():
        # Input placeholder in block format.
        points = tf.placeholder(tf.double, shape=[None, num_features], name='features')
        # Number of rows of the block, as a one-element tensor.
        block_size = tf.shape(points)[0]
        num_points = tf.stack([block_size], name="num_points")
        distances = tf_compute_distances(points, start_centers)
        indexes = tf.argmin(distances, 1, name='indexes')
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        one = tf.constant([1])
        counts = tf.tile(one, num_points, name='count')
        df2 = tfs.map_blocks([indexes, counts, min_distances], dataframe)

    # Reduction: regroup the points by their centroid index.
    gb = df2.groupBy("indexes")
    with tf.Graph().as_default():
        # Placeholder names follow the tfs.aggregate convention (see its documentation).
        x_input = tfs.block(df2, "features", tf_name="features_input")
        count_input = tfs.block(df2, "count", tf_name="count_input")
        md_input = tfs.block(df2, "min_distances", tf_name="min_distances_input")
        # Every aggregate is a plain sum over the block.
        x = tf.reduce_sum(x_input, [0], name='features')
        count = tf.reduce_sum(count_input, [0], name='count')
        min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
        df3 = tfs.aggregate([x, count, min_distances], gb)

    # Build the new centroids and the total distance from the collected groups.
    centers = []
    distance_terms = []
    for row in df3.collect():
        centers.append(np.array(row.features) / row['count'])
        distance_terms.append(row['min_distances'])
    return (np.array(centers), np.sum(distance_terms))
def run_one_step(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    Takes a dataframe with dense feature vectors and a set of centroids, assigns each
    point to its closest centroid, and aggregates the resulting clusters (with Spark)
    to produce the new centroids and the total distance of points to centroids.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the
        number of features
    :return: the matrix of new centroids (one row per collected group) and a double,
        the sum of the distances of the points to their centroid
    """
    shape = np.shape(start_centers)
    # Unpacking enforces a two-dimensional input; only the second entry is used.
    (_unused_num_centroids, num_features) = shape

    # First pass: nearest centroid index, distance to it, and a column of 1's that
    # is reduced later to count the elements of each cluster.
    with tf.Graph().as_default():
        feature_block = tf.placeholder(tf.double, shape=[None, num_features],
                                       name='features')
        # Row count of the block, kept as a TF tensor.
        num_points = tf.stack([tf.shape(feature_block)[0]], name="num_points")
        distances = tf_compute_distances(feature_block, start_centers)
        nearest = tf.argmin(distances, 1, name='indexes')
        nearest_distance = tf.reduce_min(distances, 1, name='min_distances')
        ones = tf.tile(tf.constant([1]), num_points, name='count')
        per_point = tfs.map_blocks([nearest, ones, nearest_distance], dataframe)

    # Second pass: sum features, counts and distances within each centroid group.
    grouped = per_point.groupBy("indexes")
    with tf.Graph().as_default():
        # See the tfs.aggregate documentation for the placeholder naming convention.
        features_in = tfs.block(per_point, "features", tf_name="features_input")
        count_in = tfs.block(per_point, "count", tf_name="count_input")
        distance_in = tfs.block(per_point, "min_distances", tf_name="min_distances_input")
        feature_sum = tf.reduce_sum(features_in, [0], name='features')
        count_sum = tf.reduce_sum(count_in, [0], name='count')
        distance_sum = tf.reduce_sum(distance_in, [0], name='min_distances')
        per_cluster = tfs.aggregate([feature_sum, count_sum, distance_sum], grouped)

    collected = per_cluster.collect()
    # Mean of each cluster = summed features / number of members.
    cluster_means = [np.array(row.features) / row['count'] for row in collected]
    new_centers = np.array(cluster_means)
    total_distances = np.sum([row['min_distances'] for row in collected])
    return (new_centers, total_distances)
def test_map_blocks_1(self):
    """map_blocks on a Spark DataFrame: z = x + 3 is checked on the first collected row."""
    rows = [Row(x=float(value)) for value in range(10)]
    frame = self.sql.createDataFrame(rows)
    with tf.Graph().as_default():
        # Placeholder bound to column 'x'.
        x = tf.placeholder(tf.double, shape=[None], name="x")
        # Output column 'z' = x + 3.
        z = tf.add(x, 3, name='z')
        transformed = tfs.map_blocks(z, frame)
    collected = transformed.collect()
    assert collected[0].z == 3.0, collected
def test_map_blocks_trimmed_1(self):
    """map_blocks with trim=True: the constant output z is checked on the first row."""
    rows = [Row(x=float(value)) for value in range(3)]
    frame = self.sql.createDataFrame(rows)
    with tf.Graph().as_default():
        # Placeholder bound to column 'x'; the output below does not read it.
        x = tf.placeholder(tf.double, shape=[None], name="x")
        # The output ignores the input and is a single row of data.
        z = tf.constant([2], name='z')
        trimmed = tfs.map_blocks(z, frame, trim=True)
    collected = trimmed.collect()
    assert collected[0].z == 2, collected
def test_map_blocks_2(self):
    """map_blocks on a pandas DataFrame: z = x + 3 is checked on the first element."""
    records = [dict(x=float(value)) for value in range(10)]
    frame = pd.DataFrame(records)
    with tf.Graph().as_default():
        # Placeholder bound to column 'x'.
        x = tf.placeholder(tf.double, shape=[None], name="x")
        # Output column 'z' = x + 3.
        z = tf.add(x, 3, name='z')
        transformed = tfs.map_blocks(z, frame)
    assert transformed.z[0] == 3.0, transformed
def test_map_blocks_0(self):
    """map_blocks where the added value 3 comes from a tf.Variable instead of a literal."""
    rows = [Row(x=float(value)) for value in range(10)]
    frame = self.sql.createDataFrame(rows)
    with tf.Graph().as_default():
        # Placeholder bound to column 'x'.
        x = tf.placeholder(tf.double, shape=[None], name="x")
        # The amount added to x is held in a variable named 'y'.
        y = tf.Variable(3.0, dtype=tf.double, name='y')
        z = tf.add(x, y, name='z')
        transformed = tfs.map_blocks(z, frame)
    collected = transformed.collect()
    assert collected[0].z == 3.0, collected
def covariance(df, feature_column, num_features, coarse_center):
    """
    Compute a per-group covariance-like matrix of the feature vectors.

    The computation runs in three TensorFrames passes:

    1. for every row, the outer product of the feature vector with itself is
       stored in column `out`;
    2. rows are grouped by `coarse_center`, and the features, `out` and `count`
       columns are summed within each group;
    3. for every group, with `f`, `o`, `c` the aggregated features, outer products
       and count, `(o + transpose(o)) / (2 * c - 2) - outer(f, f)` is stored in
       column `covariance`.

    :param df: dataframe containing `feature_column` and the grouping column
    :param feature_column: string, name of the column holding the feature vectors
    :param num_features: int, length of each feature vector
    :param coarse_center: string, column the rows are grouped by
    :return: dataframe returned by the last map_blocks call, holding the
        `covariance` column
    """
    # Pass 1: per-row outer products.
    with tf.Graph().as_default():
        features = tf.placeholder(tf.double, [None, num_features], name=feature_column)
        # NOTE(review): `count` is built here but is not part of the fetches below,
        # while pass 2 reads a `count` column through `count_input`. Confirm whether
        # `df` already carries a `count` column or whether `count` should be fetched.
        count = tf.identity(tf.ones_like(features)[:, 0], name='count')
        out = tf.identity(
            tf.map_fn(lambda x: tf.einsum('i,j->ij', x, x), features, dtype=tf.double),
            name='out')
        df1 = tfs.map_blocks(out, df)

    # Pass 2: sums within each group; placeholders use the `<column>_input` names.
    with tf.Graph().as_default():
        features = tf.placeholder(tf.double, [None, num_features],
                                  name=feature_column + '_input')
        out = tf.placeholder(tf.double, [None, num_features, num_features],
                             name='out_input')
        count = tf.placeholder(tf.double, [None], name='count_input')
        expected_mean = tf.identity(tf.reduce_sum(features, axis=0), name=feature_column)
        expected_out = tf.identity(tf.reduce_sum(out, axis=0), name='out')
        expected_count = tf.identity(tf.reduce_sum(count, axis=0), name='count')
        df2 = tfs.aggregate([expected_mean, expected_out, expected_count],
                            df1.groupby(coarse_center))

    # Pass 3: combine the aggregated columns, one group per row.
    with tf.Graph().as_default():
        features = tf.placeholder(tf.double, [None, num_features], name=feature_column)
        out = tf.placeholder(tf.double, [None, num_features, num_features], name='out')
        count = tf.placeholder(tf.double, [None], name='count')

        def _group_covariance(row):
            # map_fn hands over one (features, out, count) triple per row. It is
            # unpacked explicitly because tuple parameters in a lambda/def signature
            # are a syntax error on Python 3.
            f, o, c = row
            return (o + tf.transpose(o)) / (2 * c - 2) - tf.einsum('i,j->ij', f, f)

        covariance = tf.identity(
            tf.map_fn(_group_covariance, (features, out, count), dtype=tf.double),
            name='covariance')
        df3 = tfs.map_blocks(covariance, df2)
    return df3
def tf_serving_with_dataframe(df, model_base_path, model_version=None):
    """
    Run a TensorFlow saved model over a Spark dataframe with TensorFrames.

    The dataframe columns are renamed with the feed-tensor mapping, the fetch
    tensors are evaluated block-wise, and both the input and the output columns are
    renamed back through the reverse mappings.

    :param df: spark dataframe, batch input for the model
    :param model_base_path: str, tensorflow saved Model model base path
    :param model_version: int, tensorflow saved Model model version, default None
    :return: spark dataframe, with predicted result.
    """
    import tensorframes as tfs

    g, feed_tensors, fetch_tensors = load_model(model_base_path, model_version)
    with g.as_default():
        df = rename_by_mapping(df, feed_tensors)
        df = tfs.analyze(df)
        # On Python 3 dict.values() is a view, not a list; pass a real list so the
        # fetches are treated as a sequence of tensors.
        df = tfs.map_blocks(list(fetch_tensors.values()), df)
        df = rename_by_mapping(df, feed_tensors, reverse=True)
        return rename_by_mapping(df, fetch_tensors, reverse=True)
def simple_example_1():
    """Build a ten-row dataframe, add 3 to column 'x' with TensorFrames and show both."""
    builder = SparkSession.builder.appName('simple-tensorframes-example-1')
    spark = builder.getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    rows = [Row(x=float(value)) for value in range(10)]
    df = spark.createDataFrame(rows)
    df.show()

    # Execute the tensor graph.
    with tf.Graph().as_default():
        # Block placeholder for column 'x'.
        x = tfs.block(df, 'x')
        z = tf.add(x, 3, name='z')

        # Tensor -> dataframe.
        df2 = tfs.map_blocks(z, df)

    print('z =', z)
    df2.show()
# NOTE(review): this fragment relies on names bound outside of it (`init_centers`,
# `points`, `num_points`, `k`, `df0`). If it originally sat inside a
# `with tf.Graph().as_default():` block, re-indent the first part accordingly.

# Squared distances between every point and every center, expanded as
# |c|^2 + |p|^2 - 2 * p.c
centers = tf.constant(init_centers)
squares = tf.reduce_sum(tf.square(points), axis=1)
center_squares = tf.reduce_sum(tf.square(centers), axis=1)
prods = tf.matmul(points, centers, transpose_b=True)
# tf.stack replaces tf.pack, which was removed in TensorFlow 1.0.
t1a = tf.expand_dims(center_squares, 0)
t1b = tf.stack([num_points, 1])
t1 = tf.tile(t1a, t1b)
t2a = tf.expand_dims(squares, 1)
t2b = tf.stack([1, k])
t2 = tf.tile(t2a, t2b)
distances = t1 + t2 - 2 * prods
# TODO cast
indexes = tf.argmin(distances, 1, name='indexes')
min_distances = tf.reduce_min(distances, 1, name='min_distances')
# Column of 1's, summed later to count the points of each cluster.
counts = tf.tile(tf.constant([1]), tf.stack([num_points]), name='count')
df2 = tfs.map_blocks([indexes, counts, min_distances], df0)

# Perform the reduction
gb = df2.groupBy("indexes")
with tf.Graph().as_default() as g:
    # Look at the documentation of tfs.aggregate for the naming conventions of the
    # placeholders.
    x_input = tfs.block(df2, "features", tf_name="features_input")
    count_input = tfs.block(df2, "count", tf_name="count_input")
    md_input = tfs.block(df2, "min_distances", tf_name="min_distances_input")
    x = tf.reduce_sum(x_input, [0], name='features')
    count = tf.reduce_sum(count_input, [0], name='count')
    min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
    df3 = tfs.aggregate([x, count, min_distances], gb)

# Get the new centroids
df3_c = df3.collect()
from tensorframes.core import _java_api

japi = _java_api()
_java_api().initialize_logging()

# Floor division keeps the keys integral ('0', '1') on both Python 2 and Python 3;
# with true division every row would get its own float key and each group would
# hold a single row.
data = [Row(x=float(x), key=str(x // 3)) for x in range(1, 6)]
df = sqlContext.createDataFrame(data)
# Block placeholder for column 'x'; the result is not kept.
tfs.block(df, "x")

# Sum of 'x' within each key.
gb = df.groupBy("key")
with tf.Graph().as_default() as g:
    x_input = tfs.block(df, "x", tf_name="x_input")
    x = tf.reduce_sum(x_input, [0], name='x')
    df2 = tfs.aggregate(x, gb)

# Block-wise map: z = x + 3.
data = [Row(x=float(x)) for x in range(5)]
df = sqlContext.createDataFrame(data)
with tf.Graph().as_default() as g:
    # The placeholder that corresponds to column 'x'
    x = tf.placeholder(tf.double, shape=[None], name="x")
    # The output that adds 3 to x
    z = tf.add(x, 3, name='z')
    # The resulting dataframe
    df2 = tfs.map_blocks(z, df)
df2.show()
def run_one_step2(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors, a set of centroids,
    and returns a new set of centroids along with the total distance of points to
    centroids.

    For each point the closest centroid is computed, and the newly formed clusters
    are aggregated to find the new centroids. Most of the aggregation is performed in
    TensorFlow: every block is reduced to a single row before the final reduction.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the
        number of features
    :return: a k x m matrix, and a positive double
    """
    # The dimensions in the problem. Both are needed below: num_features was
    # previously discarded here although the placeholders use it.
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that
    # centroid. The index of the nearest centroid is stored in the 'indexes' column.
    # A column of 1's is also added; it is reduced later to count the number of
    # elements in each cluster.
    with tf.Graph().as_default():
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double, shape=[None, num_features], name='features')
        # The distances
        distances = tf_compute_distances(points, start_centers)
        # The rest of this block performs a pre-aggregation step in TF, to limit the
        # communication between TF and Spark.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        # tf.stack replaces tf.pack, which was removed in TensorFlow 1.0.
        num_points = tf.stack([tf.shape(points)[0]], name="num_points")
        counts = tf.tile(tf.constant([1]), num_points, name='count')
        # These compute the aggregate based on the indexes.
        block_points = tf.unsorted_segment_sum(points, indexes, num_centroids,
                                               name="block_points")
        block_counts = tf.unsorted_segment_sum(counts, indexes, num_centroids,
                                               name="block_counts")
        block_distances = tf.reduce_sum(min_distances, name="block_distances")
        # One leading dimension is added to express the fact that the previous
        # elements are just one row in the final dataframe.
        # The final dataframe has one row per block.
        agg_points = tf.expand_dims(block_points, 0, name="agg_points")
        agg_counts = tf.expand_dims(block_counts, 0, name="agg_counts")
        agg_distances = tf.expand_dims(block_distances, 0, name="agg_distances")
        # Using trimming to drop the original data (we are just returning one row of
        # data per block).
        df2 = tfs.map_blocks([agg_points, agg_counts, agg_distances],
                             dataframe, trim=True)

    # Now we simply collect and sum the elements
    with tf.Graph().as_default():
        # Look at the documentation of tfs.aggregate for the naming conventions of
        # the placeholders.
        x_input = tf.placeholder(tf.double,
                                 shape=[None, num_centroids, num_features],
                                 name='agg_points_input')
        count_input = tf.placeholder(tf.int32,
                                     shape=[None, num_centroids],
                                     name='agg_counts_input')
        md_input = tf.placeholder(tf.double, shape=[None], name='agg_distances_input')
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='agg_points')
        count = tf.reduce_sum(count_input, [0], name='agg_counts')
        min_distances = tf.reduce_sum(md_input, [0], name='agg_distances')
        (x_, count_, total_distances) = tfs.reduce_blocks([x, count, min_distances], df2)

    # The new centers. The small constant avoids a division by zero for centroids
    # that received no point.
    new_centers = (x_.T / (count_ + 1e-7)).T
    return (new_centers, total_distances)
def run_one_step(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors, a set of centroids,
    and returns a new set of centroids along with the total distance of points to
    centroids.

    For each point the closest centroid is computed, and the newly formed clusters
    are aggregated to find the new centroids.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the
        number of features
    :return: the matrix of new centroids (one row per collected group), and a
        positive double
    """
    # The dimensions in the problem
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that
    # centroid. The index of the nearest centroid is stored in the 'indexes' column.
    # A column of 1's is also added; it is reduced later to count the number of
    # elements in each cluster.
    with tf.Graph().as_default():
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double, shape=[None, num_features], name='features')
        # The shape of the block is extracted as a TF variable.
        num_points = tf.shape(points)[0]
        # The centers are embedded in the TF program.
        centers = tf.constant(start_centers)
        # Computation of the minimum distance. This is a standard implementation that
        # follows what MLlib does.
        # `axis` is the current name of the former `reduction_indices` argument.
        squares = tf.reduce_sum(tf.square(points), axis=1)
        center_squares = tf.reduce_sum(tf.square(centers), axis=1)
        prods = tf.matmul(points, centers, transpose_b=True)
        # This code simply expresses two outer products:
        # center_squares * ones(num_points) and ones(num_centroids) * squares.
        # tf.stack replaces tf.pack, which was removed in TensorFlow 1.0.
        t1a = tf.expand_dims(center_squares, 0)
        t1b = tf.stack([num_points, 1])
        t1 = tf.tile(t1a, t1b)
        t2a = tf.expand_dims(squares, 1)
        t2b = tf.stack([1, num_centroids])
        t2 = tf.tile(t2a, t2b)
        distances = t1 + t2 - 2 * prods
        # The outputs of the program.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        # This could be done based on the indexes as well.
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        counts = tf.tile(tf.constant([1]), tf.stack([num_points]), name='count')
        df2 = tfs.map_blocks([indexes, counts, min_distances], dataframe)

    # Perform the reduction: we regroup the points by their centroid indexes.
    gb = df2.groupBy("indexes")
    with tf.Graph().as_default():
        # Look at the documentation of tfs.aggregate for the naming conventions of
        # the placeholders.
        x_input = tfs.block(df2, "features", tf_name="features_input")
        count_input = tfs.block(df2, "count", tf_name="count_input")
        md_input = tfs.block(df2, "min_distances", tf_name="min_distances_input")
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='features')
        count = tf.reduce_sum(count_input, [0], name='count')
        min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
        df3 = tfs.aggregate([x, count, min_distances], gb)

    # Get the new centroids
    df3_c = df3.collect()
    # The new centroids: summed features divided by the number of members.
    new_centers = np.array([np.array(row.features) / row['count'] for row in df3_c])
    total_distances = np.sum([row['min_distances'] for row in df3_c])
    return (new_centers, total_distances)
import tensorflow as tf
import tensorframes as tfs
from pyspark.sql import Row

# Ten rows holding a single double column 'x'.
data = [Row(x=float(value)) for value in range(10)]
df = sqlContext.createDataFrame(data)

with tf.Graph().as_default() as g:
    # TensorFlow placeholder for column 'x'; tfs.block derives it from the
    # DataFrame, so no explicit shape is given here.
    x = tfs.block(df, "x")
    # Output column 'z' = x + 3.
    z = tf.add(x, 3, name='z')
    # Resulting dataframe.
    df2 = tfs.map_blocks(z, df)

# Like most DataFrame operations the transform is lazy; collecting triggers it.
df2.collect()
# The input data
data = [Row(x=[float(x), float(2 * x)], key=str(x % 2)) for x in range(1, 6)]
df = tfs.analyze(sqlContext.createDataFrame(data))

# The harmonic mean per key (count divided by the sum of the inverses):
# TODO(tjh) make a test out of this, it found some bugs
# - non numeric columns (string)
# - unused columns
# - output that has a child
col_name = "x"
col_key = "key"
with tf.Graph().as_default() as g:
    x = tfs.block(df, col_name)
    # tf.reciprocal replaces tf.inv, which was removed in TensorFlow 1.0.
    invs = tf.reciprocal(tf.to_double(x), name="invs")
    df2 = tfs.map_blocks([invs, tf.ones_like(invs, name="count")], df)

# Sum the inverses and the counts within each key.
gb = df2.select(col_key, "invs", "count").groupBy(col_key)
with tf.Graph().as_default() as g:
    x_input = tfs.block(df2, "invs", tf_name="invs_input")
    # Built from the "count" column it aggregates (it was built from "invs").
    count_input = tfs.block(df2, "count", tf_name="count_input")
    x = tf.reduce_sum(x_input, [0], name='invs')
    count = tf.reduce_sum(count_input, [0], name='count')
    df3 = tfs.aggregate([x, count], gb)

with tf.Graph().as_default() as g:
    invs = tfs.block(df2, "invs")
    count = tfs.block(df2, "count")
    # The variable keeps its historical name; the op computes the harmonic mean.
    geom_mean = tf.div(tf.to_double(count), invs, name="harmonic_mean")
# The input data
data = [Row(x=[float(x), float(2 * x)], key=str(x % 2)) for x in range(1, 6)]
df = tfs.analyze(sqlContext.createDataFrame(data))

# The harmonic mean per key (count divided by the sum of the inverses):
# TODO(tjh) make a test out of this, it found some bugs
# - non numeric columns (string)
# - unused columns
# - output that has a child
col_name = "x"
col_key = "key"
with tf.Graph().as_default() as g:
    x = tfs.block(df, col_name)
    # tf.reciprocal replaces tf.inv, which was removed in TensorFlow 1.0.
    invs = tf.reciprocal(tf.to_double(x), name="invs")
    df2 = tfs.map_blocks([invs, tf.ones_like(invs, name="count")], df)

# Sum the inverses and the counts within each key.
gb = df2.select(col_key, "invs", "count").groupBy(col_key)
with tf.Graph().as_default() as g:
    x_input = tfs.block(df2, "invs", tf_name="invs_input")
    # Built from the "count" column it aggregates (it was built from "invs").
    count_input = tfs.block(df2, "count", tf_name="count_input")
    x = tf.reduce_sum(x_input, [0], name='invs')
    count = tf.reduce_sum(count_input, [0], name='count')
    df3 = tfs.aggregate([x, count], gb)

with tf.Graph().as_default() as g:
    invs = tfs.block(df2, "invs")
    count = tfs.block(df2, "count")
    # The variable keeps its historical name; the op computes the harmonic mean.
    geom_mean = tf.div(tf.to_double(count), invs, name="harmonic_mean")
    df4 = tfs.map_blocks(geom_mean, df3).select("key", "harmonic_mean")
def infer(df, model_file=None, aggregate=True):
    """
    Predict.

    Expects the dataframe to contain a column called `coordinates` of the
    data type array<array<double>>.

    Parameters
    ----------
    df: A pyspark.sql.dataframe.DataFrame.
        Expects a column called `coordinates` of array<array<double>> type.
    model_file: String.
        Full path to a model .pb file. Expects a corresponding metadata json
        file in the same directory. If not provided, loads the sample model.
    aggregate: Boolean.
        Whether to aggregate piece-wise results into a prediction for the
        full trace.

    Returns
    -------
    A pyspark.sql.dataframe.DataFrame with two extra columns `probas`
    (array<double>) for the probabilities of each class, and `pred_modality`
    (string) for the class that has the highest probability.

    Raises
    ------
    AssertionError
        If no metadata could be loaded for the model.
    Exception
        If `aggregate` is set and the result does not have the same number of
        rows as `df`.
    """
    # Use sample model if a model is not provided.
    if model_file is None:
        # `module_dir` rather than `dir`, which would shadow the builtin.
        module_dir, _ = os.path.split(__file__)
        model_file = os.path.join(
            module_dir, "sample_model/sample_model_optimised_frozen.pb")

    # Load model metadata
    metadata = load_model_metadata(model_file)
    assert metadata is not None

    # Preprocess data
    with_ids_and_labels_df = include_id_and_label(df)  # To be joined with prediction
    with_ids_and_labels_df.persist()
    with_word_vecs_df, _, _ = include_word_vecs(with_ids_and_labels_df, metadata)
    with_phrases_df = create_phrases(
        with_word_vecs_df,
        MODEL_INPUT_CONFIG["WORD_VEC_COL"],
        MODEL_INPUT_CONFIG["ID_COL"],
        MODEL_INPUT_CONFIG["WORD_POS_COL"],
        desired_phrase_length=metadata["desired_phrase_length"],
    )
    with_phrases_df.persist()

    # Read in serialized tensorflow graph
    with tf.gfile.FastGFile(model_file, "rb") as f:
        model_graph = f.read()

    with tf.Graph().as_default():
        # Reconstruct tf graph (parse serialised graph)
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(model_graph)
        # First placeholder whose name starts with "input" and first softmax whose
        # name starts with "output"; an IndexError is raised if either is missing.
        input_op_name = [
            n.name for n in graph_def.node
            if n.op.startswith("Placeholder") and n.name.startswith("input")
        ][0]
        output_op_name = [
            n.name for n in graph_def.node
            if n.op.startswith("Softmax") and n.name.startswith("output")
        ][0]

        # Add metadata on the input size to the dataframe for tensorframes
        input_shape = [None, *metadata["input_shape"]]
        model_input_df = tfs.append_shape(
            with_phrases_df,
            with_phrases_df[MODEL_INPUT_CONFIG["INPUT_COL"]],
            shape=input_shape,
        )

        # Load graph
        [input_op, output_op] = tf.import_graph_def(
            graph_def, return_elements=[input_op_name, output_op_name])

        # Predict
        model_output_df = tfs.map_blocks(
            output_op.outputs,
            model_input_df,
            feed_dict={input_op.name: MODEL_INPUT_CONFIG["INPUT_COL"]},
        )

        # Rename column: the only new column is the model output
        # (something like 'import/output/Softmax', but might change).
        output_col = list(
            set(model_output_df.columns) - set(with_phrases_df.columns))[0]
        phrasewise_res_df = model_output_df.withColumnRenamed(
            output_col, "probas").withColumn("pred_label", argmax(col("probas")))

    if aggregate:
        phrasewise_res_df.persist()
        # Average piece-wise probabilities into full-trace probabilities, and
        # find the label with the highest probability.
        with_avg_prob_df = avg_probability(phrasewise_res_df, "id", "probas",
                                           len(metadata["classes"]))
        # Convert integer labels into string classes
        with_predicted_labels_df = reverse_create_label(
            with_avg_prob_df,
            "sentence_pred_label",
            "pred_modality",
            metadata["classes"],
        ).withColumnRenamed("sentence_probas", "probas")
        # Join prediction with the original dataframe to get the coordinates.
        # Left join so that traces without any prediction are kept (they get the
        # default values filled in at the end).
        res_df = with_ids_and_labels_df.join(
            with_predicted_labels_df,
            on=MODEL_INPUT_CONFIG["ID_COL"],
            how="left")
    else:
        # TO-DO: return pieces of coordinates rather than phrases
        res_df = reverse_create_label(phrasewise_res_df, "pred_label",
                                      "pred_modality", metadata["classes"])

    # clean up
    with_ids_and_labels_df.unpersist()
    with_phrases_df.unpersist()
    phrasewise_res_df.unpersist()

    if aggregate:
        # The row counts are only needed for this check. They are compared by
        # value: `is not` tests object identity and can report a mismatch for
        # two equal integers.
        n_pre_infer = df.count()
        n_post_infer = res_df.count()
        if n_pre_infer != n_post_infer:
            raise Exception("Some traces dropped during inference!")

    res_df.persist()
    # Fill in defaults for rows without a prediction.
    # NOTE(review): the default `probas` has three entries; confirm this matches
    # len(metadata["classes"]) for every model that can be loaded here.
    return res_df.withColumn(
        "probas",
        coalesce(col("probas"), array([lit(0.0), lit(0.0), lit(0.0)]))).withColumn(
            "pred_modality", coalesce(col("pred_modality"), lit("NA")))
def run_one_step2(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors, a set of centroids, and returns
    a new set of centroids along with the total distance of points to centroids.

    For each point the closest centroid is computed, and the newly formed clusters are then
    aggregated to find the new centroids. Most of the aggregation is performed in TensorFlow:
    every block of rows is first reduced to a single row of partial sums, and these rows are
    then summed over all the blocks.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the number of features
    :return: a k x m matrix, and a positive double
    """
    # The dimensions in the problem. Both are read from the centroid matrix so that the
    # function does not depend on a `num_features` name being defined outside of it.
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that centroid.
    # We also build a vector of 1's that is reduced later to count the number of elements in
    # each cluster.
    with tf.Graph().as_default():
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double, shape=[None, num_features], name='features')
        # The distances
        distances = tf_compute_distances(points, start_centers)
        # The rest of this block performs a pre-aggregation step in TF, to limit the
        # communication between TF and Spark.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        num_points = tf.stack([tf.shape(points)[0]], name="num_points")
        counts = tf.tile(tf.constant([1]), num_points, name='count')
        # These compute the aggregate based on the indexes.
        block_points = tf.unsorted_segment_sum(points, indexes, num_centroids,
                                               name="block_points")
        block_counts = tf.unsorted_segment_sum(counts, indexes, num_centroids,
                                               name="block_counts")
        block_distances = tf.reduce_sum(min_distances, name="block_distances")
        # One leading dimension is added to express the fact that the previous elements are just
        # one row in the final dataframe.
        # The final dataframe has one row per block.
        agg_points = tf.expand_dims(block_points, 0, name="agg_points")
        agg_counts = tf.expand_dims(block_counts, 0, name="agg_counts")
        agg_distances = tf.expand_dims(block_distances, 0, name="agg_distances")
        # Using trimming to drop the original data (we are just returning one row of data per
        # block).
        df2 = tfs.map_blocks([agg_points, agg_counts, agg_distances],
                             dataframe, trim=True)
    # Now we simply collect and sum the elements
    with tf.Graph().as_default():
        # Look at the documentation of tfs.aggregate for the naming conventions of the
        # placeholders.
        x_input = tf.placeholder(tf.double,
                                 shape=[None, num_centroids, num_features],
                                 name='agg_points_input')
        count_input = tf.placeholder(tf.int32,
                                     shape=[None, num_centroids],
                                     name='agg_counts_input')
        md_input = tf.placeholder(tf.double, shape=[None],
                                  name='agg_distances_input')
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='agg_points')
        count = tf.reduce_sum(count_input, [0], name='agg_counts')
        min_distances = tf.reduce_sum(md_input, [0], name='agg_distances')
        (x_, count_, total_distances) = tfs.reduce_blocks([x, count, min_distances], df2)
    # The new centers. The small epsilon avoids a division by zero for a centroid that
    # received no point.
    new_centers = (x_.T / (count_ + 1e-7)).T
    return (new_centers, total_distances)
centers = tf.constant(init_centers) squares = tf.reduce_sum(tf.square(points), reduction_indices=1) center_squares = tf.reduce_sum(tf.square(centers), reduction_indices=1) prods = tf.matmul(points, centers, transpose_b=True) t1a = tf.expand_dims(center_squares, 0) t1b = tf.pack([num_points, 1]) t1 = tf.tile(t1a, t1b) t2a = tf.expand_dims(squares, 1) t2b = tf.pack([1, k]) t2 = tf.tile(t2a, t2b) distances = t1 + t2 - 2 * prods # TODO cast indexes = tf.argmin(distances, 1, name='indexes') min_distances = tf.reduce_min(distances, 1, name='min_distances') counts = tf.tile(tf.constant([1]), tf.pack([num_points]), name='count') df2 = tfs.map_blocks([indexes, counts, min_distances], df0) # Perform the reduction gb = df2.groupBy("indexes") with tf.Graph().as_default() as g: # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders. x_input = tfs.block(df2, "features", tf_name="features_input") count_input = tfs.block(df2, "count", tf_name="count_input") md_input = tfs.block(df2, "min_distances", tf_name="min_distances_input") x = tf.reduce_sum(x_input, [0], name='features') count = tf.reduce_sum(count_input, [0], name='count') min_distances = tf.reduce_sum(md_input, [0], name='min_distances') df3 = tfs.aggregate([x, count, min_distances], gb) # Get the new centroids df3_c = df3.collect()
def m_kmeans(df, feature_column, num_centroids_each, num_features, m_groups, max_iter=10):
    """
    M K-means algorithm applied on a dataframe of points

    :param df: dataframe, contains all points
    :param feature_column: string, the points column within df
    :param num_centroids_each: int, k clusters
    :param num_features: int, dimension of a point vector
    :param m_groups: int, number of groups a point is split into
    :param max_iter: int, maximum number of iterations; when it is not positive no iteration
        is run and the initial centers are returned
    :return: numpy.array: [num_centroids, num_features], the k cluster centers with m groups
        concatenated
    """
    # The first `num_centroids_each` rows are used as the initial centers.
    initial_centers = df.select(feature_column).take(num_centroids_each)
    centers = np.array(initial_centers).reshape(num_centroids_each, num_features)
    # Feature ranges of the m groups. This is materialised as a list because it is reused on
    # every iteration: a lazy `map` object would be exhausted after the first pass on Python 3.
    m_slice = [slice(min(r), max(r) + 1)
               for r in np.array_split(np.arange(num_features), m_groups)]
    # Row indexes of the centers that belong to each group in the stacked result.
    slices = np.array_split(np.arange(m_groups * num_centroids_each), m_groups)
    df = tfs.analyze(df)

    # Guarantees that the returned name is bound even if the loop body never runs.
    new_centers = centers
    for _ in range(max_iter):
        # First pass: one row of partial counts and vector sums per block.
        with tf.Graph().as_default():
            points = tf.placeholder(tf.double, shape=[None, num_features],
                                    name=feature_column)
            counts, vector_sums = calculate_new_centers_for_m_slice(
                m_slice, points, tf.nn.l2_normalize(centers, dim=1), num_centroids_each)
            counts = tf.identity(counts, name='counts')
            vector_sums = tf.identity(vector_sums, name='vector_sums')
            df2 = tfs.map_blocks([counts, vector_sums], df, trim=True)

        # Second pass: sum the per-block rows into the global counts and vector sums.
        with tf.Graph().as_default():
            counts = tf.placeholder(
                tf.int64,
                shape=[None, num_centroids_each * m_groups],
                name='counts_input')
            # Integer division: a tensor dimension has to be an int on Python 3 as well.
            vector_sums = tf.placeholder(
                tf.double,
                shape=[None, num_centroids_each * m_groups, num_features // m_groups],
                name='vector_sums_input')
            count = tf.reduce_sum(counts, axis=0, name='counts')
            vector_sum = tf.reduce_sum(vector_sums, axis=0, name='vector_sums')
            d_count, d_vector_sum = tfs.reduce_blocks([count, vector_sum], df2)
            # The small epsilon avoids a division by zero for an empty cluster.
            new_centers = d_vector_sum / (d_count[:, np.newaxis] + 1e-7)
            # Put the groups side by side again to recover full-width centers.
            new_centers = np.concatenate([new_centers[i] for i in slices], axis=1)

        # Stop as soon as the centers no longer move.
        if np.allclose(centers, new_centers):
            break
        centers = new_centers

    return new_centers